From 08c8f60cec923762204b6b72a27e9bb3426c3311 Mon Sep 17 00:00:00 2001 From: Menno Date: Mon, 16 Feb 2026 13:19:04 +0100 Subject: [PATCH 1/9] in single precision, very low optical depths may result in negative sources. Limit source_up/source_dn to zero in longwave 2-stream kernel --- src_kernels_cuda_rt/rte_solver_kernels_rt.cu | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src_kernels_cuda_rt/rte_solver_kernels_rt.cu b/src_kernels_cuda_rt/rte_solver_kernels_rt.cu index 85f3f299..b5474a69 100644 --- a/src_kernels_cuda_rt/rte_solver_kernels_rt.cu +++ b/src_kernels_cuda_rt/rte_solver_kernels_rt.cu @@ -593,8 +593,9 @@ void lw_source_2stream_kernel( const Float Zdn_top = -Z + lev_source[idx_lev2]; const Float Zdn_bot = -Z + lev_source[idx_lev1]; - source_up[idx_lay] = pi * (Zup_top - r_dif[idx_lay] * Zdn_top - t_dif[idx_lay] * Zup_bot); - source_dn[idx_lay] = pi * (Zdn_bot - r_dif[idx_lay] * Zup_bot - t_dif[idx_lay] * Zdn_top); + source_up[idx_lay] = max( pi * (Zup_top - r_dif[idx_lay] * Zdn_top - t_dif[idx_lay] * Zup_bot) , Float(0.)); + source_dn[idx_lay] = max( pi * (Zdn_bot - r_dif[idx_lay] * Zup_bot - t_dif[idx_lay] * Zdn_top) , Float(0.)); + } } From 58e006b341b9cf27c1171bf5db48df3fca1d201e Mon Sep 17 00:00:00 2001 From: Menno Date: Tue, 17 Feb 2026 16:12:41 +0100 Subject: [PATCH 2/9] print number of g-points solved in 3D with Monte Carlo to screen --- src_test/radiation_solver_rt.cu | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src_test/radiation_solver_rt.cu b/src_test/radiation_solver_rt.cu index fb70dfb2..eb260923 100644 --- a/src_test/radiation_solver_rt.cu +++ b/src_test/radiation_solver_rt.cu @@ -545,6 +545,8 @@ void Radiation_solver_longwave::solve_gpu( const Array& band_limits_gpt(this->kdist_gpu->get_band_lims_gpoint()); int previous_band = 0; + int monte_carlo_gpoints = 0; + for (int igpt=1; igpt<=n_gpt; ++igpt) { int band = 0; @@ -603,6 +605,8 @@ void Radiation_solver_longwave::solve_gpu( } + // Find maximum gasous optical depth to and compute lowest mean free path on the clearsky atmosphere + // Allocate temporary storage void* d_temp_storage = nullptr; size_t temp_storage_bytes = 0; @@ -744,6 +748,8 @@ void Radiation_solver_longwave::solve_gpu( if ( (lowest_gas_mean_free_path / grid_d_xy_min) > min_mfp_grid_ratio) { + ++monte_carlo_gpoints; + raytracer_lw.trace_rays( igpt, switch_independent_column, @@ -791,11 +797,13 @@ void Radiation_solver_longwave::solve_gpu( Gpt_combine_kernels_cuda_rt::add_from_gpoint( n_col, grid_cells.z, rt_flux_abs.ptr(), (*fluxes).get_flux_abs_dif().ptr()); + } } Tools_gpu::free_gpu(max_tau_gas_g); } + Status::print_message("Solved "+ std::to_string(monte_carlo_gpoints)+" out of "+std::to_string(n_gpt)+" g-points in 3D with Monte Carlo"); } Radiation_solver_shortwave::Radiation_solver_shortwave( From 18da8048a7ba24ca0f17b299daa7701670330b5b Mon Sep 17 00:00:00 2001 From: Menno Date: Tue, 17 Feb 2026 17:09:53 +0100 Subject: [PATCH 3/9] split raytracing cmd line flag and switch to a sw and lw version. Number of samples of lw monte carlo can now be provided via cmd line, e.g. --lw-raytracing 20 to use 2**20 samples per spectral quadrature points --- include_rt/raytracer_lw.h | 2 +- src_cuda_rt/raytracer_lw.cu | 27 ++++++++++++-- src_test/test_rte_rrtmgp_rt.cu | 68 +++++++++++++++++++++++----------- 3 files changed, 70 insertions(+), 27 deletions(-) diff --git a/include_rt/raytracer_lw.h b/include_rt/raytracer_lw.h index cc24d5f9..eba76d6b 100644 --- a/include_rt/raytracer_lw.h +++ b/include_rt/raytracer_lw.h @@ -24,7 +24,7 @@ class Raytracer_lw void trace_rays( const int igpt, const bool switch_independent_column, - const Int n_photons, + const Int lw_photon_count, const Raytracer_definitions::Vector grid_cells, const Raytracer_definitions::Vector grid_d, const Raytracer_definitions::Vector kn_grid, diff --git a/src_cuda_rt/raytracer_lw.cu b/src_cuda_rt/raytracer_lw.cu index 1b9dfdae..df50dfdf 100644 --- a/src_cuda_rt/raytracer_lw.cu +++ b/src_cuda_rt/raytracer_lw.cu @@ -186,7 +186,7 @@ namespace void Raytracer_lw::trace_rays( const int igpt, const bool switch_independent_column, - const Int photons_per_pixel, + const Int lw_photon_count, const Vector grid_cells, const Vector grid_d, const Vector kn_grid, @@ -322,10 +322,29 @@ void Raytracer_lw::trace_rays( // actual_photons_per_pixel = photons_per_thread * rt_kernel_grid_size * rt_kernel_block_size / (qrng_grid_x * qrng_grid_y); // } - dim3 grid(rt_lw_kernel_grid); - dim3 block(rt_lw_kernel_block); + Int photons_per_thread; + Int rt_kernel_grid_size = rt_lw_kernel_grid; + Int rt_kernel_block_size = rt_lw_kernel_block; + + if (rt_lw_kernel_grid*rt_lw_kernel_block < lw_photon_count) + { + photons_per_thread = lw_photon_count / (rt_lw_kernel_grid*rt_lw_kernel_block); + } + else if (lw_photon_count > rt_lw_kernel_block) + { + photons_per_thread = 1; + rt_kernel_grid_size = lw_photon_count / rt_lw_kernel_grid; + } + else + { + photons_per_thread = 1; + rt_kernel_grid_size = 1; + rt_kernel_block_size = lw_photon_count; + } + + dim3 grid(rt_kernel_grid_size); + dim3 block(rt_kernel_block_size); - const Int photons_per_thread = 1024; const Float rng_offset = igpt*rt_lw_kernel_grid*rt_lw_kernel_block; auto run_raytracer = [&]( diff --git a/src_test/test_rte_rrtmgp_rt.cu b/src_test/test_rte_rrtmgp_rt.cu index 4e5880d9..f9926d4a 100644 --- a/src_test/test_rte_rrtmgp_rt.cu +++ b/src_test/test_rte_rrtmgp_rt.cu @@ -229,7 +229,8 @@ void solve_radiation(int argc, char** argv) {"longwave" , { false, "Enable computation of longwave radiation." }}, {"fluxes" , { true, "Enable computation of fluxes." }}, {"two-stream" , { true, "Run two-stream solver for to obtain 1D fluxes" }}, - {"raytracing" , { true, "Use raytracing for flux computation. '--raytracing 256': use 256 rays per pixel per spectral quadrature point" }}, + {"sw-raytracing" , { true, "Use shortwave raytracing for flux computation. '--sw-raytracing 256': use 256 rays per pixel per spectral quadrature point" }}, + {"lw-raytracing" , { true, "Use longwave raytracing for flux computation. '--lw-raytracing 22': use a total of 2**22 rays per spectral quadrature point" }}, {"independent-column", { false, "run raytracer in independent column mode"}}, {"cloud-optics" , { false, "Enable cloud optics (both liquid and ice)."}}, {"liq-cloud-optics" , { false, "liquid only cloud optics." }}, @@ -245,7 +246,8 @@ void solve_radiation(int argc, char** argv) {"tica" , { false, "attenuate path when doing an overhead 1D calculation of tilted input" }}}; std::map command_line_numbers { - {"raytracing", 32}, + {"sw-raytracing", 256}, + {"lw-raytracing", 22}, {"single-gpt", 1}, {"min-mfp-grid-ratio", 1}}; @@ -256,7 +258,8 @@ void solve_radiation(int argc, char** argv) const bool switch_longwave = command_line_switches.at("longwave" ).first; const bool switch_fluxes = command_line_switches.at("fluxes" ).first; const bool switch_twostream = command_line_switches.at("two-stream" ).first; - const bool switch_raytracing = command_line_switches.at("raytracing" ).first; + bool switch_sw_raytracing = command_line_switches.at("sw-raytracing" ).first; + bool switch_lw_raytracing = command_line_switches.at("lw-raytracing" ).first; bool switch_independent_column= command_line_switches.at("independent-column").first; bool switch_cloud_optics = command_line_switches.at("cloud-optics" ).first; bool switch_liq_cloud_optics = command_line_switches.at("liq-cloud-optics" ).first; @@ -271,16 +274,36 @@ void solve_radiation(int argc, char** argv) const bool switch_delta_aerosol = command_line_switches.at("delta-aerosol" ).first; const bool switch_tica = command_line_switches.at("tica" ).first; - Int photons_per_pixel = Int(command_line_numbers.at("raytracing")); - if (Float(int(std::log2(Float(photons_per_pixel)))) != std::log2(Float(photons_per_pixel))) + + if (!switch_shortwave) + switch_sw_raytracing = false; + + if (!switch_longwave) + switch_lw_raytracing = false; + + if (switch_shortwave && !switch_twostream && !switch_sw_raytracing) { - std::string error = "number of photons per pixel should be a power of 2 "; + std::string error = "With shortwave enable, need to run either two-stream solver or ray tracer "; throw std::runtime_error(error); } - if (!switch_twostream && !switch_raytracing) { - std::string error = "cannot disable two-stream for flux calculation without turning ray tracing on"; - throw std::runtime_error(error); + Int sw_photons_per_pixel; + if (switch_sw_raytracing) + { + sw_photons_per_pixel = Int(command_line_numbers.at("sw-raytracing")); + if (Float(int(std::log2(Float(sw_photons_per_pixel)))) != std::log2(Float(sw_photons_per_pixel))) + { + std::string error = "number of photons per pixel should be a power of 2 "; + throw std::runtime_error(error); + } + } + + Int lw_photon_power; + Int lw_photon_count; + if (switch_lw_raytracing) + { + lw_photon_power = Int(command_line_numbers.at("lw-raytracing")); + lw_photon_count = 1 << lw_photon_power; } if (switch_cloud_optics) @@ -289,14 +312,10 @@ void solve_radiation(int argc, char** argv) switch_ice_cloud_optics = true; } if (switch_liq_cloud_optics || switch_ice_cloud_optics) - { switch_cloud_optics = true; - } if (switch_tica) - { switch_independent_column = true; - } if (switch_cloud_mie && switch_ice_cloud_optics) { @@ -311,7 +330,12 @@ void solve_radiation(int argc, char** argv) const Float min_mfp_grid_ratio = switch_min_mfp_grid_ratio ? command_line_numbers.at("min-mfp-grid-ratio") : Float(0.); - Status::print_message("Using "+ std::to_string(photons_per_pixel) + " rays per pixel"); + if (switch_sw_raytracing) + Status::print_message("Shortwave: using "+ std::to_string(sw_photons_per_pixel) + " rays per pixel per g-point"); + + if (switch_lw_raytracing) + Status::print_message("Longwave: using 2**"+std::to_string(lw_photon_power) + " ("+std::to_string(lw_photon_count) + ") rays per g-point"); + ////// READ THE ATMOSPHERIC DATA ////// Status::print_message("Reading atmospheric input data from NetCDF."); @@ -679,7 +703,7 @@ void solve_radiation(int argc, char** argv) Array_gpu rt_flux_sfc_dn; Array_gpu rt_flux_abs; - if (switch_raytracing) + if (switch_lw_raytracing) { rt_flux_tod_up.set_dims({n_col_x, n_col_y}); rt_flux_tod_dn.set_dims({n_col_x, n_col_y}); @@ -724,7 +748,7 @@ void solve_radiation(int argc, char** argv) rad_lw.solve_gpu( switch_fluxes, - switch_raytracing, + switch_lw_raytracing, switch_cloud_optics, switch_aerosol_optics, switch_single_gpt, @@ -732,7 +756,7 @@ void solve_radiation(int argc, char** argv) switch_independent_column, single_gpt, min_mfp_grid_ratio, - photons_per_pixel, + lw_photon_count, grid_cells, grid_d, kn_grid, @@ -879,7 +903,7 @@ void solve_radiation(int argc, char** argv) nc_lw_gpt_flux_net.insert(lw_gpt_flux_net_cpu.v(), {0, 0, 0}); } - if (switch_raytracing) + if (switch_lw_raytracing) { auto rt_flux_tod_up = output_nc.add_variable("rt_lw_flux_tod_up" , { "y", "x"}); auto rt_flux_tod_dn = output_nc.add_variable("rt_lw_flux_tod_dn" , { "y", "x"}); @@ -991,7 +1015,7 @@ void solve_radiation(int argc, char** argv) sw_flux_net .set_dims({n_col, n_lev}); } - if (switch_raytracing) + if (switch_sw_raytracing) { rt_flux_tod_up .set_dims({n_col_x, n_col_y}); rt_flux_sfc_dir.set_dims({n_col_x, n_col_y}); @@ -1053,7 +1077,7 @@ void solve_radiation(int argc, char** argv) rad_sw.solve_gpu( switch_fluxes, switch_twostream, - switch_raytracing, + switch_sw_raytracing, switch_independent_column, switch_cloud_optics, switch_cloud_mie, @@ -1063,7 +1087,7 @@ void solve_radiation(int argc, char** argv) switch_delta_aerosol, switch_tica, single_gpt, - photons_per_pixel, + sw_photons_per_pixel, grid_cells, grid_d, kn_grid, @@ -1217,7 +1241,7 @@ void solve_radiation(int argc, char** argv) } - if (switch_raytracing) + if (switch_sw_raytracing) { auto nc_rt_flux_tod_up = output_nc.add_variable("rt_flux_tod_up", {"y","x"}); auto nc_rt_flux_sfc_dir = output_nc.add_variable("rt_flux_sfc_dir", {"y","x"}); From 6616149f7960f3a3ac283b1b8736cdf6047bfba9 Mon Sep 17 00:00:00 2001 From: Menno Date: Wed, 18 Feb 2026 10:03:50 +0100 Subject: [PATCH 4/9] remove unused variable and add explicit cast to float, otherwise compilers throws a warning --- src_cuda_rt/raytracer_lw.cu | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src_cuda_rt/raytracer_lw.cu b/src_cuda_rt/raytracer_lw.cu index df50dfdf..155905b1 100644 --- a/src_cuda_rt/raytracer_lw.cu +++ b/src_cuda_rt/raytracer_lw.cu @@ -269,8 +269,6 @@ void Raytracer_lw::trace_rays( build_alias_table(power_sfc.ptr(), n_2d, alias_prob_sfc.ptr(), alias_idx_sfc.ptr(), total_power_sfc); build_alias_table(power_tod.ptr(), n_2d, alias_prob_tod.ptr(), alias_idx_tod.ptr(), total_power_tod); - const Float total_power = total_power_atm + total_power_sfc + total_power_tod; - const int block_kn_x = 8; const int block_kn_y = 8; const int block_kn_z = 4; @@ -378,7 +376,7 @@ void Raytracer_lw::trace_rays( emis_sfc.ptr(), grid_size, grid_d, grid_cells, kn_grid); - const Float power_per_photon = total_power_src / (photons_per_thread * rt_lw_kernel_grid * rt_lw_kernel_block); + const Float power_per_photon = Float(total_power_src / photons_per_thread * rt_lw_kernel_grid * rt_lw_kernel_block); count_to_flux_2d<<>>( grid_cells, From b3afe096aa061f33e2409b2375dcd5038003cd0e Mon Sep 17 00:00:00 2001 From: Menno Date: Wed, 18 Feb 2026 10:35:49 +0100 Subject: [PATCH 5/9] set longwave switch to true by default --- src_test/test_rte_rrtmgp_rt.cu | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src_test/test_rte_rrtmgp_rt.cu b/src_test/test_rte_rrtmgp_rt.cu index f9926d4a..e95bb140 100644 --- a/src_test/test_rte_rrtmgp_rt.cu +++ b/src_test/test_rte_rrtmgp_rt.cu @@ -226,7 +226,7 @@ void solve_radiation(int argc, char** argv) // Parse the command line options. std::map> command_line_switches { {"shortwave" , { true, "Enable computation of shortwave radiation."}}, - {"longwave" , { false, "Enable computation of longwave radiation." }}, + {"longwave" , { true, "Enable computation of longwave radiation." }}, {"fluxes" , { true, "Enable computation of fluxes." }}, {"two-stream" , { true, "Run two-stream solver for to obtain 1D fluxes" }}, {"sw-raytracing" , { true, "Use shortwave raytracing for flux computation. '--sw-raytracing 256': use 256 rays per pixel per spectral quadrature point" }}, @@ -276,7 +276,10 @@ void solve_radiation(int argc, char** argv) if (!switch_shortwave) + { switch_sw_raytracing = false; + switch_twostream = false; + } if (!switch_longwave) switch_lw_raytracing = false; From 1aa19ae3794951764796605be6cdadf56545e744 Mon Sep 17 00:00:00 2001 From: Menno Date: Wed, 18 Feb 2026 13:23:53 +0100 Subject: [PATCH 6/9] oops, accidently removed bracket --- src_cuda_rt/raytracer_lw.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src_cuda_rt/raytracer_lw.cu b/src_cuda_rt/raytracer_lw.cu index 155905b1..e6f972db 100644 --- a/src_cuda_rt/raytracer_lw.cu +++ b/src_cuda_rt/raytracer_lw.cu @@ -376,7 +376,7 @@ void Raytracer_lw::trace_rays( emis_sfc.ptr(), grid_size, grid_d, grid_cells, kn_grid); - const Float power_per_photon = Float(total_power_src / photons_per_thread * rt_lw_kernel_grid * rt_lw_kernel_block); + const Float power_per_photon = Float(total_power_src / (photons_per_thread * rt_lw_kernel_grid * rt_lw_kernel_block)); count_to_flux_2d<<>>( grid_cells, From c2088e43b2c81c23b0caa5f958df73d5727c028a Mon Sep 17 00:00:00 2001 From: Menno Date: Wed, 18 Feb 2026 13:37:41 +0100 Subject: [PATCH 7/9] switch_twostream no longer a const, it can be set later on --- src_test/test_rte_rrtmgp_rt.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src_test/test_rte_rrtmgp_rt.cu b/src_test/test_rte_rrtmgp_rt.cu index e95bb140..da8d25a4 100644 --- a/src_test/test_rte_rrtmgp_rt.cu +++ b/src_test/test_rte_rrtmgp_rt.cu @@ -257,7 +257,7 @@ void solve_radiation(int argc, char** argv) const bool switch_shortwave = command_line_switches.at("shortwave" ).first; const bool switch_longwave = command_line_switches.at("longwave" ).first; const bool switch_fluxes = command_line_switches.at("fluxes" ).first; - const bool switch_twostream = command_line_switches.at("two-stream" ).first; + bool switch_twostream = command_line_switches.at("two-stream" ).first; bool switch_sw_raytracing = command_line_switches.at("sw-raytracing" ).first; bool switch_lw_raytracing = command_line_switches.at("lw-raytracing" ).first; bool switch_independent_column= command_line_switches.at("independent-column").first; From cb4d61c7d28e69a3599441f34ec825e4e2f5ef5e Mon Sep 17 00:00:00 2001 From: Menno Date: Wed, 18 Feb 2026 16:42:50 +0100 Subject: [PATCH 8/9] combine surface, top-of-domain, and atmosphere aliastables into one: this speeds up the raytracer by about 30% and halves noise --- include_rt_kernels/raytracer_kernels_lw.h | 3 +- src_cuda_rt/raytracer_lw.cu | 179 ++++++++------------ src_kernels_cuda_rt/raytracer_kernels_lw.cu | 90 +++++----- src_test/test_rte_rrtmgp_rt.cu | 5 +- 4 files changed, 110 insertions(+), 167 deletions(-) diff --git a/include_rt_kernels/raytracer_kernels_lw.h b/include_rt_kernels/raytracer_kernels_lw.h index 442927e3..cc34dbc5 100644 --- a/include_rt_kernels/raytracer_kernels_lw.h +++ b/include_rt_kernels/raytracer_kernels_lw.h @@ -20,8 +20,7 @@ constexpr Float k_null_gas_min = Float(1.e-3); __global__ void ray_tracer_lw_kernel( - const Float rng_offset, - const int src_type, + const Int rng_offset, const bool independent_column, const Int photons_to_shoot, const double* __restrict__ alias_prob, diff --git a/src_cuda_rt/raytracer_lw.cu b/src_cuda_rt/raytracer_lw.cu index 155905b1..c4f90063 100644 --- a/src_cuda_rt/raytracer_lw.cu +++ b/src_cuda_rt/raytracer_lw.cu @@ -40,13 +40,11 @@ namespace __global__ void get_emitted_power( - const Vector grid_cells, const Vector grid_d, + const Vector grid_cells, const Float* __restrict__ tau_tot, const Float* __restrict__ ssa_tot, const Float* __restrict__ lay_source, const Float* __restrict__ sfc_source, const Float* __restrict__ emis_sfc, const Float tod_inc_flx, - Float* __restrict__ power_atm, - Float* __restrict__ power_sfc, - Float* __restrict__ power_tod) + Float* __restrict__ emitted_power) { const int ix = blockIdx.x*blockDim.x + threadIdx.x; const int iy = blockIdx.y*blockDim.y + threadIdx.y; @@ -54,19 +52,19 @@ namespace if ( (ix < grid_cells.x) && (iy < grid_cells.y) && (iz < (grid_cells.z)) ) { - // tau = k_ext * dz; hence k_ext*cell_volume = tau * cell_area - const int idx = ix + iy*grid_cells.x + iz*grid_cells.x*grid_cells.y; - power_atm[idx] = Float(4.) * M_PI * tau_tot[idx] * lay_source[idx] * (Float(1.) - ssa_tot[idx]); + const int idx_3d_in = ix + iy*grid_cells.x + iz*grid_cells.x*grid_cells.y; + const int idx_3d_out = idx_3d_in + grid_cells.x*grid_cells.y; + emitted_power[idx_3d_out] = Float(4.) * M_PI * tau_tot[idx_3d_in] * lay_source[idx_3d_in] * (Float(1.) - ssa_tot[idx_3d_in]); if (iz == 0) { const int idx_2d = ix + iy*grid_cells.x; - power_sfc[idx_2d] = M_PI * emis_sfc[idx_2d] * sfc_source[idx_2d]; + emitted_power[idx_2d] = M_PI * emis_sfc[idx_2d] * sfc_source[idx_2d]; } if (iz == grid_cells.z-1) { - const int idx_2d = ix + iy*grid_cells.x; - power_tod[idx_2d] = tod_inc_flx; + const int idx_2d = ix + iy*grid_cells.x + (iz+2)*grid_cells.x*grid_cells.y; + emitted_power[idx_2d] = tod_inc_flx; } } @@ -236,38 +234,28 @@ void Raytracer_lw::trace_rays( tau_aeros.ptr(), ssa_aeros.ptr(), asy_aeros.ptr(), k_ext.ptr(), scat_asy.ptr()); - Array_gpu power_atm({grid_cells.x, grid_cells.y, grid_cells.z}); - Array_gpu power_sfc({grid_cells.x, grid_cells.y}); - Array_gpu power_tod({grid_cells.x, grid_cells.y}); + Array_gpu emitted_power({grid_cells.x, grid_cells.y, grid_cells.z+2}); - const int grid_z_pwr = (grid_cells.z+2)/block_z + ((grid_cells.z+2)%block_z > 0); - dim3 grid_pwr(grid_col_x, grid_col_y, grid_z_pwr); - dim3 block_pwr(block_col_x, block_col_y, block_z); + const int block_pwr_x = 64; + const int grid_pwr_x = grid_cells.x/block_pwr_x + (grid_cells.x%block_pwr_x > 0); - get_emitted_power<<>>( - grid_cells, grid_d, + dim3 grid_pwr(grid_pwr_x, grid_cells.y, grid_cells.z+2); + dim3 block_pwr(block_pwr_x, 1, 1); + + get_emitted_power<<>>( + grid_cells, tau_total.ptr(), ssa_total.ptr(), lay_source.ptr(), sfc_source.ptr(), emis_sfc.ptr(), tod_inc_flx, - power_atm.ptr(), power_sfc.ptr(), power_tod.ptr()); + emitted_power.ptr()); // Build alias tables for emission source sampling. - const int n_atm = grid_cells.x*grid_cells.y*grid_cells.z; - const int n_2d = grid_cells.x*grid_cells.y; + const int n_alias = grid_cells.x*grid_cells.y*(grid_cells.z+2); - Array_gpu alias_prob_atm({n_atm}); - Array_gpu alias_idx_atm({n_atm}); + Array_gpu alias_prob({n_alias}); + Array_gpu alias_idx({n_alias}); - Array_gpu alias_prob_sfc({n_2d}); - Array_gpu alias_idx_sfc({n_2d}); - - Array_gpu alias_prob_tod({n_2d}); - Array_gpu alias_idx_tod({n_2d}); - - double total_power_atm, total_power_sfc, total_power_tod; - - build_alias_table(power_atm.ptr(), n_atm, alias_prob_atm.ptr(), alias_idx_atm.ptr(), total_power_atm); - build_alias_table(power_sfc.ptr(), n_2d, alias_prob_sfc.ptr(), alias_idx_sfc.ptr(), total_power_sfc); - build_alias_table(power_tod.ptr(), n_2d, alias_prob_tod.ptr(), alias_idx_tod.ptr(), total_power_tod); + double total_power; + build_alias_table(emitted_power.ptr(), n_alias, alias_prob.ptr(), alias_idx.ptr(), total_power); const int block_kn_x = 8; const int block_kn_y = 8; @@ -295,31 +283,15 @@ void Raytracer_lw::trace_rays( Array_gpu surface_up_count({grid_cells.x, grid_cells.y}); Array_gpu atmos_count({grid_cells.x, grid_cells.y, grid_cells.z}); + Gas_optics_rrtmgp_kernels_cuda_rt::zero_array(grid_cells.x, grid_cells.y, tod_dn_count.ptr()); + Gas_optics_rrtmgp_kernels_cuda_rt::zero_array(grid_cells.x, grid_cells.y, tod_up_count.ptr()); + Gas_optics_rrtmgp_kernels_cuda_rt::zero_array(grid_cells.x, grid_cells.y, surface_dn_count.ptr()); + Gas_optics_rrtmgp_kernels_cuda_rt::zero_array(grid_cells.x, grid_cells.y, surface_up_count.ptr()); + Gas_optics_rrtmgp_kernels_cuda_rt::zero_array(grid_cells.x, grid_cells.y, grid_cells.z, atmos_count.ptr()); + // domain sizes const Vector grid_size = grid_d * grid_cells; - // // number of photons per thread, this should a power of 2 and nonzero - // Float photons_per_thread_tmp = std::max(Float(1), static_cast(photons_total) / (rt_kernel_grid * rt_kernel_block)); - // Int photons_per_thread = pow(Float(2.), std::floor(std::log2(photons_per_thread_tmp))); - - // // with very low number of columns and photons_per_pixel, we may have too many threads firing a single photons, actually exceeding photons_per pixel - // // In that case, reduce grid and block size - // Int actual_photons_per_pixel = photons_per_thread * rt_kernel_grid * rt_kernel_block / (qrng_grid_x * qrng_grid_y); - - // int rt_kernel_grid_size = rt_kernel_grid; - // int rt_kernel_block_size = rt_kernel_block; - // while ( (actual_photons_per_pixel > photons_per_pixel) ) - // { - // if (rt_kernel_grid_size > 1) - // rt_kernel_grid_size /= 2; - // else - // rt_kernel_block_size /= 2; - - // photons_per_thread_tmp = std::max(Float(1), static_cast(photons_total) / (rt_kernel_grid_size * rt_kernel_block_size)); - // photons_per_thread = pow(Float(2.), std::floor(std::log2(photons_per_thread_tmp))); - // actual_photons_per_pixel = photons_per_thread * rt_kernel_grid_size * rt_kernel_block_size / (qrng_grid_x * qrng_grid_y); - // } - Int photons_per_thread; Int rt_kernel_grid_size = rt_lw_kernel_grid; Int rt_kernel_block_size = rt_lw_kernel_block; @@ -343,63 +315,44 @@ void Raytracer_lw::trace_rays( dim3 grid(rt_kernel_grid_size); dim3 block(rt_kernel_block_size); - const Float rng_offset = igpt*rt_lw_kernel_grid*rt_lw_kernel_block; - - auto run_raytracer = [&]( - const int src_type, - const double* alias_prob, - const int* alias_idx, - const int n_table, - const double total_power_src) - { - Gas_optics_rrtmgp_kernels_cuda_rt::zero_array(grid_cells.x, grid_cells.y, tod_dn_count.ptr()); - Gas_optics_rrtmgp_kernels_cuda_rt::zero_array(grid_cells.x, grid_cells.y, tod_up_count.ptr()); - Gas_optics_rrtmgp_kernels_cuda_rt::zero_array(grid_cells.x, grid_cells.y, surface_dn_count.ptr()); - Gas_optics_rrtmgp_kernels_cuda_rt::zero_array(grid_cells.x, grid_cells.y, surface_up_count.ptr()); - Gas_optics_rrtmgp_kernels_cuda_rt::zero_array(grid_cells.x, grid_cells.y, grid_cells.z, atmos_count.ptr()); - - ray_tracer_lw_kernel<<>>( - rng_offset, - src_type, - switch_independent_column, - photons_per_thread, - alias_prob, - alias_idx, - n_table, - k_null_grid.ptr(), - tod_dn_count.ptr(), - tod_up_count.ptr(), - surface_dn_count.ptr(), - surface_up_count.ptr(), - atmos_count.ptr(), - k_ext.ptr(), scat_asy.ptr(), - emis_sfc.ptr(), - grid_size, grid_d, grid_cells, kn_grid); - - const Float power_per_photon = Float(total_power_src / photons_per_thread * rt_lw_kernel_grid * rt_lw_kernel_block); - - count_to_flux_2d<<>>( - grid_cells, - power_per_photon, - tod_dn_count.ptr(), - tod_up_count.ptr(), - surface_dn_count.ptr(), - surface_up_count.ptr(), - flux_tod_dn.ptr(), - flux_tod_up.ptr(), - flux_sfc_dn.ptr(), - flux_sfc_up.ptr()); - - count_to_flux_3d<<>>( - grid_cells, grid_d, - power_per_photon, - atmos_count.ptr(), - flux_abs.ptr()); - }; - - run_raytracer(0, alias_prob_atm.ptr(), alias_idx_atm.ptr(), n_atm, total_power_atm); - run_raytracer(1, alias_prob_sfc.ptr(), alias_idx_sfc.ptr(), n_2d, total_power_sfc); - run_raytracer(2, alias_prob_tod.ptr(), alias_idx_tod.ptr(), n_2d, total_power_tod); + const Int rng_offset = igpt*rt_lw_kernel_grid*rt_lw_kernel_block; + + ray_tracer_lw_kernel<<>>( + rng_offset, + switch_independent_column, + photons_per_thread, + alias_prob.ptr(), + alias_idx.ptr(), + n_alias, + k_null_grid.ptr(), + tod_dn_count.ptr(), + tod_up_count.ptr(), + surface_dn_count.ptr(), + surface_up_count.ptr(), + atmos_count.ptr(), + k_ext.ptr(), scat_asy.ptr(), + emis_sfc.ptr(), + grid_size, grid_d, grid_cells, kn_grid); + + const Float power_per_photon = Float(total_power / (photons_per_thread * rt_lw_kernel_grid * rt_lw_kernel_block)); + + count_to_flux_2d<<>>( + grid_cells, + power_per_photon, + tod_dn_count.ptr(), + tod_up_count.ptr(), + surface_dn_count.ptr(), + surface_up_count.ptr(), + flux_tod_dn.ptr(), + flux_tod_up.ptr(), + flux_sfc_dn.ptr(), + flux_sfc_up.ptr()); + + count_to_flux_3d<<>>( + grid_cells, grid_d, + power_per_photon, + atmos_count.ptr(), + flux_abs.ptr()); } Raytracer_lw::Raytracer_lw() diff --git a/src_kernels_cuda_rt/raytracer_kernels_lw.cu b/src_kernels_cuda_rt/raytracer_kernels_lw.cu index 0c85decd..86457cc2 100644 --- a/src_kernels_cuda_rt/raytracer_kernels_lw.cu +++ b/src_kernels_cuda_rt/raytracer_kernels_lw.cu @@ -98,7 +98,7 @@ namespace __device__ inline void reset_photon( - Photon& photon, const int src_type, + Photon& photon, Int& photons_shot, const Int photons_to_shoot, const double* __restrict__ alias_prob, const int* __restrict__ alias_idx, @@ -112,7 +112,8 @@ namespace Float* __restrict__ const surface_up_count, Float* __restrict__ const atmos_count, Float& photon_weight, - Float& total_absorbed_weight) + Float& total_absorbed_weight, + int& src_type) { ++photons_shot; @@ -120,67 +121,56 @@ namespace { Float mu, azi; - if (src_type == 0) - { - const int idx = sample_alias_table( - alias_prob, alias_idx, alias_n, - alias_rng(), alias_rng()); + const int idx = sample_alias_table( + alias_prob, alias_idx, alias_n, + alias_rng(), alias_rng()); - const int i = (idx%(grid_cells.x * grid_cells.y)) % grid_cells.x ; - const int j = (idx%(grid_cells.x * grid_cells.y)) / grid_cells.x ; - const int k = idx / (grid_cells.x * grid_cells.y) ; + const int i = (idx%(grid_cells.x * grid_cells.y)) % grid_cells.x ; + const int j = (idx%(grid_cells.x * grid_cells.y)) / grid_cells.x ; + const int k = idx / (grid_cells.x * grid_cells.y) ; - const int ij = i + j * grid_cells.x; + const int ij = i + j * grid_cells.x; + + if (k == 0) // surface + { + src_type = 1; photon.position.x = (i + rng()) * grid_d.x; photon.position.y = (j + rng()) * grid_d.y; - photon.position.z = (k + rng()) * grid_d.z; + photon.position.z = Float(0.); - mu = rng()*Float(2.) - Float(1.); + mu = sqrt(rng()); azi = Float(2.*M_PI)*rng(); - const int ijk = ij + k*grid_cells.x*grid_cells.y; - photon.starting_idx = ijk; + photon.starting_idx = ij; } - if (src_type == 1) + else if (k == grid_cells.z+1) // top-of-domain { - const int idx = sample_alias_table( - alias_prob, alias_idx, alias_n, - alias_rng(), alias_rng()); - - const int i = idx % grid_cells.x ; - const int j = idx / grid_cells.x ; - - const int ij = i + j * grid_cells.x; - + src_type = 2; photon.position.x = (i + rng()) * grid_d.x; photon.position.y = (j + rng()) * grid_d.y; - photon.position.z = Float(0.); + photon.position.z = grid_size.z - Float_epsilon; - mu = sqrt(rng()); + mu = Float(-1.)*sqrt(rng()); azi = Float(2.*M_PI)*rng(); photon.starting_idx = ij; + } - if (src_type == 2) + else { - const int idx = sample_alias_table( - alias_prob, alias_idx, alias_n, - alias_rng(), alias_rng()); - - const int i = idx % grid_cells.x ; - const int j = idx / grid_cells.x ; - - const int ij = i + j * grid_cells.x; + src_type = 0; + const int km = k - 1; photon.position.x = (i + rng()) * grid_d.x; photon.position.y = (j + rng()) * grid_d.y; - photon.position.z = grid_size.z - Float_epsilon; + photon.position.z = (km + rng()) * grid_d.z; - mu = Float(-1.)*sqrt(rng()); + mu = rng()*Float(2.) - Float(1.); azi = Float(2.*M_PI)*rng(); - photon.starting_idx = ij; + const int ijk = ij + km*grid_cells.x*grid_cells.y; + photon.starting_idx = ijk; } const Float s = sqrt(Float(1.) - mu*mu + Float_epsilon); @@ -199,8 +189,7 @@ namespace __global__ void ray_tracer_lw_kernel( - const Float rng_offset, - const int src_type, + const Int rng_offset, const bool independent_column, const Int photons_to_shoot, const double* __restrict__ alias_prob, @@ -236,14 +225,14 @@ void ray_tracer_lw_kernel( Int photons_shot = Atomic_reduce_const; Float photon_weight; Float total_absorbed_weight; + int src_type; reset_photon( - photon, src_type, - photons_shot, photons_to_shoot, + photon, photons_shot, photons_to_shoot, alias_prob, alias_idx, alias_n, rng, alias_rng, grid_size, grid_d, grid_cells, toa_down_count, surface_up_count, atmos_count, - photon_weight, total_absorbed_weight); + photon_weight, total_absorbed_weight, src_type); Float tau = Float(0.); Float d_max = Float(0.); @@ -323,12 +312,11 @@ void ray_tracer_lw_kernel( write_emission(photon, src_type, total_absorbed_weight, toa_down_count, surface_up_count, atmos_count); reset_photon( - photon, src_type, - photons_shot, photons_to_shoot, + photon, photons_shot, photons_to_shoot, alias_prob, alias_idx, alias_n, rng, alias_rng, grid_size, grid_d, grid_cells, toa_down_count, surface_up_count, atmos_count, - photon_weight, total_absorbed_weight); + photon_weight, total_absorbed_weight, src_type); // Cycle the while loop, this photon is done. continue; @@ -368,12 +356,12 @@ void ray_tracer_lw_kernel( write_emission(photon, src_type, total_absorbed_weight, toa_down_count, surface_up_count, atmos_count); reset_photon( - photon, src_type, + photon, photons_shot, photons_to_shoot, alias_prob, alias_idx, alias_n, rng, alias_rng, grid_size, grid_d, grid_cells, toa_down_count, surface_up_count, atmos_count, - photon_weight, total_absorbed_weight); + photon_weight, total_absorbed_weight, src_type); // Cycle the while loop, this photon is done. continue; @@ -505,12 +493,12 @@ void ray_tracer_lw_kernel( write_emission(photon, src_type, total_absorbed_weight, toa_down_count, surface_up_count, atmos_count); reset_photon( - photon, src_type, + photon, photons_shot, photons_to_shoot, alias_prob, alias_idx, alias_n, rng, alias_rng, grid_size, grid_d, grid_cells, toa_down_count, surface_up_count, atmos_count, - photon_weight, total_absorbed_weight); + photon_weight, total_absorbed_weight, src_type); } } diff --git a/src_test/test_rte_rrtmgp_rt.cu b/src_test/test_rte_rrtmgp_rt.cu index f9926d4a..e868af5e 100644 --- a/src_test/test_rte_rrtmgp_rt.cu +++ b/src_test/test_rte_rrtmgp_rt.cu @@ -257,7 +257,7 @@ void solve_radiation(int argc, char** argv) const bool switch_shortwave = command_line_switches.at("shortwave" ).first; const bool switch_longwave = command_line_switches.at("longwave" ).first; const bool switch_fluxes = command_line_switches.at("fluxes" ).first; - const bool switch_twostream = command_line_switches.at("two-stream" ).first; + bool switch_twostream = command_line_switches.at("two-stream" ).first; bool switch_sw_raytracing = command_line_switches.at("sw-raytracing" ).first; bool switch_lw_raytracing = command_line_switches.at("lw-raytracing" ).first; bool switch_independent_column= command_line_switches.at("independent-column").first; @@ -276,7 +276,10 @@ void solve_radiation(int argc, char** argv) if (!switch_shortwave) + { switch_sw_raytracing = false; + switch_twostream = true; + } if (!switch_longwave) switch_lw_raytracing = false; From 565828442b20d99c722c103d45bbe9ed5f4b60b4 Mon Sep 17 00:00:00 2001 From: Menno Date: Wed, 18 Feb 2026 16:48:46 +0100 Subject: [PATCH 9/9] rename two-stream switch to sw-two-stream and default to false --- src_test/test_rte_rrtmgp_rt.cu | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/src_test/test_rte_rrtmgp_rt.cu b/src_test/test_rte_rrtmgp_rt.cu index a55115ea..38527941 100644 --- a/src_test/test_rte_rrtmgp_rt.cu +++ b/src_test/test_rte_rrtmgp_rt.cu @@ -228,7 +228,7 @@ void solve_radiation(int argc, char** argv) {"shortwave" , { true, "Enable computation of shortwave radiation."}}, {"longwave" , { true, "Enable computation of longwave radiation." }}, {"fluxes" , { true, "Enable computation of fluxes." }}, - {"two-stream" , { true, "Run two-stream solver for to obtain 1D fluxes" }}, + {"sw-two-stream" , { false, "Run two-stream solver for to obtain 1D fluxes" }}, {"sw-raytracing" , { true, "Use shortwave raytracing for flux computation. '--sw-raytracing 256': use 256 rays per pixel per spectral quadrature point" }}, {"lw-raytracing" , { true, "Use longwave raytracing for flux computation. '--lw-raytracing 22': use a total of 2**22 rays per spectral quadrature point" }}, {"independent-column", { false, "run raytracer in independent column mode"}}, @@ -257,7 +257,7 @@ void solve_radiation(int argc, char** argv) const bool switch_shortwave = command_line_switches.at("shortwave" ).first; const bool switch_longwave = command_line_switches.at("longwave" ).first; const bool switch_fluxes = command_line_switches.at("fluxes" ).first; - bool switch_twostream = command_line_switches.at("two-stream" ).first; + bool switch_sw_twostream = command_line_switches.at("sw-two-stream" ).first; bool switch_sw_raytracing = command_line_switches.at("sw-raytracing" ).first; bool switch_lw_raytracing = command_line_switches.at("lw-raytracing" ).first; bool switch_independent_column= command_line_switches.at("independent-column").first; @@ -276,14 +276,12 @@ void solve_radiation(int argc, char** argv) if (!switch_shortwave) - { switch_sw_raytracing = false; - } if (!switch_longwave) switch_lw_raytracing = false; - if (switch_shortwave && !switch_twostream && !switch_sw_raytracing) + if (switch_shortwave && !switch_sw_twostream && !switch_sw_raytracing) { std::string error = "With shortwave enable, need to run either two-stream solver or ray tracer "; throw std::runtime_error(error); @@ -1009,7 +1007,7 @@ void solve_radiation(int argc, char** argv) if (switch_fluxes) { - if(switch_twostream) + if(switch_sw_twostream) { sw_flux_up .set_dims({n_col, n_lev}); sw_flux_dn .set_dims({n_col, n_lev}); @@ -1078,7 +1076,7 @@ void solve_radiation(int argc, char** argv) rad_sw.solve_gpu( switch_fluxes, - switch_twostream, + switch_sw_twostream, switch_sw_raytracing, switch_independent_column, switch_cloud_optics, @@ -1217,7 +1215,7 @@ void solve_radiation(int argc, char** argv) if (switch_fluxes) { - if (switch_twostream) + if (switch_sw_twostream) { auto nc_sw_flux_up = output_nc.add_variable("sw_flux_up" , {"lev", "y", "x"}); auto nc_sw_flux_dn = output_nc.add_variable("sw_flux_dn" , {"lev", "y", "x"}); @@ -1283,7 +1281,7 @@ void solve_radiation(int argc, char** argv) if (switch_single_gpt) { - if (switch_twostream) + if (switch_sw_twostream) { auto nc_sw_gpt_flux_up = output_nc.add_variable("sw_gpt_flux_up" , {"lev", "y", "x"}); auto nc_sw_gpt_flux_dn = output_nc.add_variable("sw_gpt_flux_dn" , {"lev", "y", "x"});