@@ -239,64 +239,18 @@ void ABLStats::compute_zi()
         .create_scratch_field(3, m_temperature.num_grow()[0]);
     fvm::gradient(*gradT, m_temperature);

-    // // Only compute zi using coarsest level
-    // BL_PROFILE_VAR("amr-wind::ABLStats::compute_zi_a", ab);
+    // Only compute zi using coarsest level
     const int lev = 0;
     const int dir = m_normal_dir;
     const auto& geom = (this->m_sim.repo()).mesh().Geom(lev);
     auto const& domain_box = geom.Domain();
-    // const auto& gradT_arrs = (*gradT)(lev).const_arrays();
-    // auto device_tg_fab = amrex::ReduceToPlane<
-    //     amrex::ReduceOpMax, amrex::KeyValuePair<amrex::Real, int>>(
-    //     dir, domain_box, m_temperature(lev),
-    //     [=] AMREX_GPU_DEVICE(int nbx, int i, int j, int k)
-    //         -> amrex::KeyValuePair<amrex::Real, int> {
-    //         const amrex::IntVect iv(i, j, k);
-    //         return {gradT_arrs[nbx](i, j, k, dir), iv[dir]};
-    //     });
-    // BL_PROFILE_VAR_STOP(ab);
-
-    // BL_PROFILE_VAR("amr-wind::ABLStats::compute_zi_b", bb);
-    // #ifdef AMREX_USE_GPU
-    // amrex::BaseFab<amrex::KeyValuePair<amrex::Real, int>> pinned_tg_fab(
-    //     device_tg_fab.box(), device_tg_fab.nComp(), amrex::The_Pinned_Arena());
-    // amrex::Gpu::dtoh_memcpy(
-    //     pinned_tg_fab.dataPtr(), device_tg_fab.dataPtr(),
-    //     pinned_tg_fab.nBytes());
-    // #else
-    // auto& pinned_tg_fab = device_tg_fab;
-    // #endif
-    // BL_PROFILE_VAR_STOP(bb);
-
-    // BL_PROFILE_VAR("amr-wind::ABLStats::compute_zi_barrier", barrier);
-    // amrex::ParallelDescriptor::Barrier();
-    // BL_PROFILE_VAR_STOP(barrier);
-
-    // BL_PROFILE_VAR("amr-wind::ABLStats::compute_zi_c", cb);
-    // amrex::ParallelReduce::Max(
-    //     pinned_tg_fab.dataPtr(), static_cast<int>(pinned_tg_fab.size()),
-    //     amrex::ParallelDescriptor::IOProcessorNumber(),
-    //     amrex::ParallelDescriptor::Communicator());
-    // BL_PROFILE_VAR_STOP(cb);
-
-    // BL_PROFILE_VAR("amr-wind::ABLStats::compute_zi_d", db);
-    // if (amrex::ParallelDescriptor::IOProcessor()) {
-    //     const auto dnval = m_dn;
-    //     auto* p = pinned_tg_fab.dataPtr();
-    //     m_zi = amrex::Reduce::Sum<amrex::Real>(
-    //         pinned_tg_fab.size(),
-    //         [=] AMREX_GPU_DEVICE(int i) noexcept -> amrex::Real {
-    //             return (p[i].second() + 0.5) * dnval;
-    //         },
-    //         0.0);
-    //     m_zi /= static_cast<amrex::Real>(pinned_tg_fab.size());
-    // }
-    // BL_PROFILE_VAR_STOP(db);
-
-    AMREX_ALWAYS_ASSERT(domain_box.smallEnd() == 0); // We could relax this if necessary.
-    amrex::Array<bool,AMREX_SPACEDIM> decomp{AMREX_D_DECL(true,true,true)};
+
+    AMREX_ALWAYS_ASSERT(
+        domain_box.smallEnd() == 0); // We could relax this if necessary.
+    amrex::Array<bool, AMREX_SPACEDIM> decomp{AMREX_D_DECL(true, true, true)};
     decomp[dir] = false; // no domain decompose in the dir direction.
-    auto new_ba = amrex::decompose(domain_box, amrex::ParallelDescriptor::NProcs(), decomp);
+    auto new_ba = amrex::decompose(
+        domain_box, amrex::ParallelDescriptor::NProcs(), decomp);

     amrex::Vector<int> pmap(new_ba.size());
     std::iota(pmap.begin(), pmap.end(), 0);
@@ -310,7 +264,8 @@ void ABLStats::compute_zi()
     if (myproc < new_mf.size()) {
         auto const& a = new_mf.const_array(myproc);
         amrex::Box box2d = amrex::makeSlab(amrex::Box(a), dir, 0);
-        AMREX_ALWAYS_ASSERT(dir == 2); // xxxxx TODO: we can support other directions later
+        AMREX_ALWAYS_ASSERT(
+            dir == 2); // xxxxx TODO: we can support other directions later
         // xxxxx TODO: sycl can be supported in the future.
         // xxxxx TODO: we can support CPU later.
         const int nblocks = box2d.numPts();
@@ -321,35 +276,40 @@ void ABLStats::compute_zi()
         const int loy = box2d.smallEnd(1);
         amrex::Gpu::DeviceVector<int> tmp(nblocks);
         auto* ptmp = tmp.data();
-        amrex::launch<nthreads>(nblocks, amrex::Gpu::gpuStream(),
-        [=] AMREX_GPU_DEVICE ()
-        {
-            const int j = int(blockIdx.x) / lenx + loy;
-            const int i = int(blockIdx.x) - j*lenx + lox;
-            amrex::KeyValuePair<amrex::Real,int> r{std::numeric_limits<amrex::Real>::lowest(),0};
-            for (int k = threadIdx.x; k < lenz; k += nthreads) {
-                if (a(i,j,k) > r.first()) { r.second() = k; r.first() = a(i,j,k); }
-            }
-            r = amrex::Gpu::blockReduceMax<nthreads>(r);
-            if (threadIdx.x == 0) {
-                ptmp[blockIdx.x] = r.second();
-            }
-        });
+        amrex::launch<nthreads>(
+            nblocks, amrex::Gpu::gpuStream(), [=] AMREX_GPU_DEVICE() {
+                const int j = int(blockIdx.x) / lenx + loy;
+                const int i = int(blockIdx.x) - j * lenx + lox;
+                amrex::KeyValuePair<amrex::Real, int> r{
+                    std::numeric_limits<amrex::Real>::lowest(), 0};
+                for (int k = threadIdx.x; k < lenz; k += nthreads) {
+                    if (a(i, j, k) > r.first()) {
+                        r.second() = k;
+                        r.first() = a(i, j, k);
+                    }
+                }
+                r = amrex::Gpu::blockReduceMax<nthreads>(r);
+                if (threadIdx.x == 0) {
+                    ptmp[blockIdx.x] = r.second();
+                }
+            });

         const auto dnval = m_dn;
-        zi_sum = amrex::Reduce::Sum<amrex::Real>
-            (nblocks, [=] AMREX_GPU_DEVICE (int iblock)
-        {
-            return (ptmp[iblock] + amrex::Real(0.5)) * dnval;
-        });
+        zi_sum = amrex::Reduce::Sum<amrex::Real>(
+            nblocks, [=] AMREX_GPU_DEVICE(int iblock) {
+                return (ptmp[iblock] + amrex::Real(0.5)) * dnval;
+            });
     }

-    amrex::ParallelReduce::Sum(zi_sum, amrex::ParallelDescriptor::IOProcessorNumber(),
-        amrex::ParallelDescriptor::Communicator());
+    amrex::ParallelReduce::Sum(
+        zi_sum, amrex::ParallelDescriptor::IOProcessorNumber(),
+        amrex::ParallelDescriptor::Communicator());

     amrex::Long npts = 1;
     for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) {
-        if (idim != dir) { npts *= domain_box.length(idim); }
+        if (idim != dir) {
+            npts *= domain_box.length(idim);
+        }
     }
     m_zi = zi_sum / static_cast<amrex::Real>(npts);
 }
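
The reduction in this diff is rank-local only because the field is first redistributed onto a `BoxArray` built by `amrex::decompose` with `decomp[dir] = false`: the domain is cut in the two horizontal directions but never along the normal, and the identity `pmap` assigns box n to rank n, so each rank owns complete vertical columns. Below is a toy stand-in for that pencil decomposition; it uses a simple near-square factorization rather than AMReX's actual algorithm, and the grid sizes and rank count are made up.

```cpp
// Toy pencil decomposition: split only the horizontal plane among ranks,
// never cutting in z, so every rank holds whole columns. This mimics the
// intent of amrex::decompose(domain_box, nprocs, decomp) with
// decomp[dir] = false; the factorization here is a simplification.
#include <cmath>
#include <cstdio>

int main()
{
    const int nx = 128, ny = 96, nz = 64; // made-up domain size
    const int nprocs = 12;                // made-up rank count

    // Choose px * py == nprocs with px near sqrt(nprocs).
    int px = static_cast<int>(std::round(std::sqrt(double(nprocs))));
    while (nprocs % px != 0) { --px; }
    const int py = nprocs / px;

    for (int rank = 0; rank < nprocs; ++rank) { // identity pmap: box n -> rank n
        const int bi = rank % px;
        const int bj = rank / px;
        const int ilo = bi * nx / px, ihi = (bi + 1) * nx / px - 1;
        const int jlo = bj * ny / py, jhi = (bj + 1) * ny / py - 1;
        // Full z extent: the per-column reduction needs no communication.
        std::printf("rank %2d owns [%3d:%3d] x [%2d:%2d] x [0:%d]\n",
                    rank, ilo, ihi, jlo, jhi, nz - 1);
    }
    return 0;
}
```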
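Inside each pencil, the kernel launches one GPU block per (i, j) column: threads stride over k, each tracking the largest vertical temperature gradient it has seen as a (value, k) pair, `amrex::Gpu::blockReduceMax` combines the per-thread pairs, and the winning index is converted to a cell-center height (k + 0.5) * dn. Averaging those heights over every column of the coarsest level gives `m_zi`. Here is a serial sketch of the same computation on a plain array, with a fabricated gradient field whose maximum sits at k = 40 in every column.

```cpp
// Serial sketch of the per-column reduction the GPU kernel in the diff
// performs: argmax of dT/dz over k in each (i,j) column, then an average
// of the corresponding cell-center heights. Sizes, dz, and the gradient
// field are fabricated for illustration.
#include <cstdio>
#include <limits>
#include <vector>

int main()
{
    const int nx = 8, ny = 8, nz = 64; // toy grid (assumption)
    const double dz = 10.0;            // cell size in the normal dir (assumption)

    // Fabricated gradT field: peak gradient at k = 40 in every column.
    std::vector<double> gradT(nx * ny * nz);
    auto idx = [=](int i, int j, int k) { return (k * ny + j) * nx + i; };
    for (int k = 0; k < nz; ++k) {
        for (int j = 0; j < ny; ++j) {
            for (int i = 0; i < nx; ++i) {
                gradT[idx(i, j, k)] = -(k - 40) * (k - 40); // max at k == 40
            }
        }
    }

    double zi_sum = 0.0;
    for (int j = 0; j < ny; ++j) {
        for (int i = 0; i < nx; ++i) {
            // One "block" of the GPU kernel: argmax over the column.
            int kmax = 0;
            double gmax = std::numeric_limits<double>::lowest();
            for (int k = 0; k < nz; ++k) {
                if (gradT[idx(i, j, k)] > gmax) {
                    gmax = gradT[idx(i, j, k)];
                    kmax = k;
                }
            }
            zi_sum += (kmax + 0.5) * dz; // cell-center height of the max
        }
    }
    // Average over all columns; with MPI this would be the global sum
    // divided by the global column count, as in the diff.
    const double zi = zi_sum / (nx * ny);
    std::printf("zi = %g\n", zi); // expect 405 for this fabricated field
    return 0;
}
```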