@@ -237,64 +237,18 @@ void ABLStats::compute_zi()
         .create_scratch_field(3, m_temperature.num_grow()[0]);
     fvm::gradient(*gradT, m_temperature);
 
-    // // Only compute zi using coarsest level
-    // BL_PROFILE_VAR("amr-wind::ABLStats::compute_zi_a", ab);
+    // Only compute zi using coarsest level
     const int lev = 0;
     const int dir = m_normal_dir;
     const auto& geom = (this->m_sim.repo()).mesh().Geom(lev);
     auto const& domain_box = geom.Domain();
-    // const auto& gradT_arrs = (*gradT)(lev).const_arrays();
-    // auto device_tg_fab = amrex::ReduceToPlane<
-    //     amrex::ReduceOpMax, amrex::KeyValuePair<amrex::Real, int>>(
-    //     dir, domain_box, m_temperature(lev),
-    //     [=] AMREX_GPU_DEVICE(int nbx, int i, int j, int k)
-    //         -> amrex::KeyValuePair<amrex::Real, int> {
-    //         const amrex::IntVect iv(i, j, k);
-    //         return {gradT_arrs[nbx](i, j, k, dir), iv[dir]};
-    //     });
-    // BL_PROFILE_VAR_STOP(ab);
-
-    // BL_PROFILE_VAR("amr-wind::ABLStats::compute_zi_b", bb);
-    // #ifdef AMREX_USE_GPU
-    // amrex::BaseFab<amrex::KeyValuePair<amrex::Real, int>> pinned_tg_fab(
-    //     device_tg_fab.box(), device_tg_fab.nComp(), amrex::The_Pinned_Arena());
-    // amrex::Gpu::dtoh_memcpy(
-    //     pinned_tg_fab.dataPtr(), device_tg_fab.dataPtr(),
-    //     pinned_tg_fab.nBytes());
-    // #else
-    // auto& pinned_tg_fab = device_tg_fab;
-    // #endif
-    // BL_PROFILE_VAR_STOP(bb);
-
-    // BL_PROFILE_VAR("amr-wind::ABLStats::compute_zi_barrier", barrier);
-    // amrex::ParallelDescriptor::Barrier();
-    // BL_PROFILE_VAR_STOP(barrier);
-
-    // BL_PROFILE_VAR("amr-wind::ABLStats::compute_zi_c", cb);
-    // amrex::ParallelReduce::Max(
-    //     pinned_tg_fab.dataPtr(), static_cast<int>(pinned_tg_fab.size()),
-    //     amrex::ParallelDescriptor::IOProcessorNumber(),
-    //     amrex::ParallelDescriptor::Communicator());
-    // BL_PROFILE_VAR_STOP(cb);
-
-    // BL_PROFILE_VAR("amr-wind::ABLStats::compute_zi_d", db);
-    // if (amrex::ParallelDescriptor::IOProcessor()) {
-    //     const auto dnval = m_dn;
-    //     auto* p = pinned_tg_fab.dataPtr();
-    //     m_zi = amrex::Reduce::Sum<amrex::Real>(
-    //         pinned_tg_fab.size(),
-    //         [=] AMREX_GPU_DEVICE(int i) noexcept -> amrex::Real {
-    //             return (p[i].second() + 0.5) * dnval;
-    //         },
-    //         0.0);
-    //     m_zi /= static_cast<amrex::Real>(pinned_tg_fab.size());
-    // }
-    // BL_PROFILE_VAR_STOP(db);
-
-    AMREX_ALWAYS_ASSERT(domain_box.smallEnd() == 0); // We could relax this if necessary.
-    amrex::Array<bool,AMREX_SPACEDIM> decomp{AMREX_D_DECL(true,true,true)};
+
+    AMREX_ALWAYS_ASSERT(
+        domain_box.smallEnd() == 0); // We could relax this if necessary.
+    amrex::Array<bool, AMREX_SPACEDIM> decomp{AMREX_D_DECL(true, true, true)};
     decomp[dir] = false; // no domain decompose in the dir direction.
-    auto new_ba = amrex::decompose(domain_box, amrex::ParallelDescriptor::NProcs(), decomp);
+    auto new_ba = amrex::decompose(
+        domain_box, amrex::ParallelDescriptor::NProcs(), decomp);
 
     amrex::Vector<int> pmap(new_ba.size());
     std::iota(pmap.begin(), pmap.end(), 0);
@@ -308,7 +262,8 @@ void ABLStats::compute_zi()
     if (myproc < new_mf.size()) {
         auto const& a = new_mf.const_array(myproc);
         amrex::Box box2d = amrex::makeSlab(amrex::Box(a), dir, 0);
-        AMREX_ALWAYS_ASSERT(dir == 2); // xxxxx TODO: we can support other directions later
+        AMREX_ALWAYS_ASSERT(
+            dir == 2); // xxxxx TODO: we can support other directions later
         // xxxxx TODO: sycl can be supported in the future.
         // xxxxx TODO: we can support CPU later.
         const int nblocks = box2d.numPts();
@@ -319,35 +274,40 @@ void ABLStats::compute_zi()
         const int loy = box2d.smallEnd(1);
         amrex::Gpu::DeviceVector<int> tmp(nblocks);
         auto* ptmp = tmp.data();
-        amrex::launch<nthreads>(nblocks, amrex::Gpu::gpuStream(),
-        [=] AMREX_GPU_DEVICE ()
-        {
-            const int j = int(blockIdx.x) / lenx + loy;
-            const int i = int(blockIdx.x) - j*lenx + lox;
-            amrex::KeyValuePair<amrex::Real,int> r{std::numeric_limits<amrex::Real>::lowest(),0};
-            for (int k = threadIdx.x; k < lenz; k += nthreads) {
-                if (a(i,j,k) > r.first()) { r.second() = k; r.first() = a(i,j,k);}
-            }
-            r = amrex::Gpu::blockReduceMax<nthreads>(r);
-            if (threadIdx.x == 0) {
-                ptmp[blockIdx.x] = r.second();
-            }
-        });
+        amrex::launch<nthreads>(
+            nblocks, amrex::Gpu::gpuStream(), [=] AMREX_GPU_DEVICE() {
+                const int j = int(blockIdx.x) / lenx + loy;
+                const int i = int(blockIdx.x) - j * lenx + lox;
+                amrex::KeyValuePair<amrex::Real, int> r{
+                    std::numeric_limits<amrex::Real>::lowest(), 0};
+                for (int k = threadIdx.x; k < lenz; k += nthreads) {
+                    if (a(i, j, k) > r.first()) {
+                        r.second() = k;
+                        r.first() = a(i, j, k);
+                    }
+                }
+                r = amrex::Gpu::blockReduceMax<nthreads>(r);
+                if (threadIdx.x == 0) {
+                    ptmp[blockIdx.x] = r.second();
+                }
+            });
 
         const auto dnval = m_dn;
-        zi_sum = amrex::Reduce::Sum<amrex::Real>
-            (nblocks, [=] AMREX_GPU_DEVICE (int iblock)
-        {
-            return (ptmp[iblock] + amrex::Real(0.5)) * dnval;
-        });
+        zi_sum = amrex::Reduce::Sum<amrex::Real>(
+            nblocks, [=] AMREX_GPU_DEVICE(int iblock) {
+                return (ptmp[iblock] + amrex::Real(0.5)) * dnval;
+            });
     }
 
-    amrex::ParallelReduce::Sum(zi_sum, amrex::ParallelDescriptor::IOProcessorNumber(),
-        amrex::ParallelDescriptor::Communicator());
+    amrex::ParallelReduce::Sum(
+        zi_sum, amrex::ParallelDescriptor::IOProcessorNumber(),
+        amrex::ParallelDescriptor::Communicator());
 
     amrex::Long npts = 1;
     for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) {
-        if (idim != dir) { npts *= domain_box.length(idim); }
+        if (idim != dir) {
+            npts *= domain_box.length(idim);
+        }
     }
     m_zi = zi_sum / static_cast<amrex::Real>(npts);
 }
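
The central design choice in the first hunk is `decomp[dir] = false`: `amrex::decompose` splits the domain only in the directions orthogonal to `dir`, so each rank receives whole vertical pencils and the `k`-scan in the kernel never crosses a rank boundary. That is what lets this path drop the `ReduceToPlane` result, the device-to-host copy, and the barrier of the old commented-out path. A minimal sketch of the same partitioning idea on a flattened plane of columns; the even chunking and the `column_range_for_rank` helper are illustrative assumptions, not the AMReX API:

```cpp
// Sketch: assign each rank a contiguous chunk of (i,j) columns so the
// vertical scan over each column is entirely rank-local. Even chunking is
// an illustrative assumption; amrex::decompose builds real Boxes instead.
#include <utility>

std::pair<long, long> column_range_for_rank(
    long ncolumns, // nx * ny columns in the horizontal plane
    int nranks,
    int rank)
{
    const long base = ncolumns / nranks;
    const long rem = ncolumns % nranks;
    // The first `rem` ranks each take one extra column
    const long begin = rank * base + (rank < rem ? rank : rem);
    const long count = base + (rank < rem ? 1 : 0);
    return {begin, begin + count}; // half-open range [begin, end)
}
```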
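Stripped of the GPU machinery, the quantity being computed is simple: for every (i,j) column, find the cell `k` where the vertical temperature gradient peaks, convert that index to a cell-centered height, and average those heights over all columns to obtain `m_zi`. A serial C++ sketch of that reduction; the `std::vector` storage, flattened index layout, and `dz` parameter are illustrative assumptions rather than the AMReX MultiFab layout:

```cpp
// Serial sketch of the zi reduction: per-column argmax of dT/dz,
// converted to a cell-centered height and averaged over all columns.
#include <limits>
#include <vector>

double compute_zi_serial(
    const std::vector<double>& gradT_z, // dT/dz at cell centers, size nx*ny*nz
    int nx, int ny, int nz, double dz)
{
    double zi_sum = 0.0;
    for (int j = 0; j < ny; ++j) {
        for (int i = 0; i < nx; ++i) {
            double max_grad = std::numeric_limits<double>::lowest();
            int k_max = 0;
            for (int k = 0; k < nz; ++k) {
                // Assumed flattening: k varies slowest within a column
                const double g = gradT_z[(k * ny + j) * nx + i];
                if (g > max_grad) {
                    max_grad = g;
                    k_max = k;
                }
            }
            // Cell-centered height of the max-gradient cell, matching the
            // (ptmp[iblock] + 0.5) * dnval expression in the kernel above
            zi_sum += (k_max + 0.5) * dz;
        }
    }
    return zi_sum / (static_cast<double>(nx) * ny); // average over columns
}
```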