@@ -237,47 +237,119 @@ void ABLStats::compute_zi()
237237 .create_scratch_field (3 , m_temperature.num_grow ()[0 ]);
238238 fvm::gradient (*gradT, m_temperature);
239239
240- // Only compute zi using coarsest level
240+ // // Only compute zi using coarsest level
241+ // BL_PROFILE_VAR("amr-wind::ABLStats::compute_zi_a", ab);
241242 const int lev = 0 ;
242243 const int dir = m_normal_dir;
243244 const auto & geom = (this ->m_sim .repo ()).mesh ().Geom (lev);
244245 auto const & domain_box = geom.Domain ();
245- const auto & gradT_arrs = (*gradT)(lev).const_arrays ();
246- auto device_tg_fab = amrex::ReduceToPlane<
247- amrex::ReduceOpMax, amrex::KeyValuePair<amrex::Real, int >>(
248- dir, domain_box, m_temperature (lev),
249- [=] AMREX_GPU_DEVICE (int nbx, int i, int j, int k)
250- -> amrex::KeyValuePair<amrex::Real, int > {
251- const amrex::IntVect iv (i, j, k);
252- return {gradT_arrs[nbx](i, j, k, dir), iv[dir]};
246+ // const auto& gradT_arrs = (*gradT)(lev).const_arrays();
247+ // auto device_tg_fab = amrex::ReduceToPlane<
248+ // amrex::ReduceOpMax, amrex::KeyValuePair<amrex::Real, int>>(
249+ // dir, domain_box, m_temperature(lev),
250+ // [=] AMREX_GPU_DEVICE(int nbx, int i, int j, int k)
251+ // -> amrex::KeyValuePair<amrex::Real, int> {
252+ // const amrex::IntVect iv(i, j, k);
253+ // return {gradT_arrs[nbx](i, j, k, dir), iv[dir]};
254+ // });
255+ // BL_PROFILE_VAR_STOP(ab);
256+
257+ // BL_PROFILE_VAR("amr-wind::ABLStats::compute_zi_b", bb);
258+ // #ifdef AMREX_USE_GPU
259+ // amrex::BaseFab<amrex::KeyValuePair<amrex::Real, int>> pinned_tg_fab(
260+ // device_tg_fab.box(), device_tg_fab.nComp(), amrex::The_Pinned_Arena());
261+ // amrex::Gpu::dtoh_memcpy(
262+ // pinned_tg_fab.dataPtr(), device_tg_fab.dataPtr(),
263+ // pinned_tg_fab.nBytes());
264+ // #else
265+ // auto& pinned_tg_fab = device_tg_fab;
266+ // #endif
267+ // BL_PROFILE_VAR_STOP(bb);
268+
269+ // BL_PROFILE_VAR("amr-wind::ABLStats::compute_zi_barrier", barrier);
270+ // amrex::ParallelDescriptor::Barrier();
271+ // BL_PROFILE_VAR_STOP(barrier);
272+
273+ // BL_PROFILE_VAR("amr-wind::ABLStats::compute_zi_c", cb);
274+ // amrex::ParallelReduce::Max(
275+ // pinned_tg_fab.dataPtr(), static_cast<int>(pinned_tg_fab.size()),
276+ // amrex::ParallelDescriptor::IOProcessorNumber(),
277+ // amrex::ParallelDescriptor::Communicator());
278+ // BL_PROFILE_VAR_STOP(cb);
279+
280+ // BL_PROFILE_VAR("amr-wind::ABLStats::compute_zi_d", db);
281+ // if (amrex::ParallelDescriptor::IOProcessor()) {
282+ // const auto dnval = m_dn;
283+ // auto* p = pinned_tg_fab.dataPtr();
284+ // m_zi = amrex::Reduce::Sum<amrex::Real>(
285+ // pinned_tg_fab.size(),
286+ // [=] AMREX_GPU_DEVICE(int i) noexcept -> amrex::Real {
287+ // return (p[i].second() + 0.5) * dnval;
288+ // },
289+ // 0.0);
290+ // m_zi /= static_cast<amrex::Real>(pinned_tg_fab.size());
291+ // }
292+ // BL_PROFILE_VAR_STOP(db);
293+
294+ AMREX_ALWAYS_ASSERT (domain_box.smallEnd () == 0 ); // We could relax this if necessary.
295+ amrex::Array<bool ,AMREX_SPACEDIM> decomp{AMREX_D_DECL (true ,true ,true )};
296+ decomp[dir] = false ; // no domain decompose in the dir direction.
297+ auto new_ba = amrex::decompose (domain_box, amrex::ParallelDescriptor::NProcs (), decomp);
298+
299+ amrex::Vector<int > pmap (new_ba.size ());
300+ std::iota (pmap.begin (), pmap.end (), 0 );
301+ amrex::DistributionMapping new_dm (std::move (pmap));
302+
303+ amrex::MultiFab new_mf (new_ba, new_dm, 1 , 0 );
304+ new_mf.ParallelCopy ((*gradT)(lev), dir, 0 , 1 );
305+
306+ amrex::Real zi_sum = 0 ;
307+ int myproc = amrex::ParallelDescriptor::MyProc ();
308+ if (myproc < new_mf.size ()) {
309+ auto const & a = new_mf.const_array (myproc);
310+ amrex::Box box2d = amrex::makeSlab (amrex::Box (a), dir, 0 );
311+ AMREX_ALWAYS_ASSERT (dir == 2 ); // xxxxx TODO: we can support other directions later
312+ // xxxxx TODO: sycl can be supported in the future.
313+ // xxxxx TODO: we can support CPU later.
314+ const int nblocks = box2d.numPts ();
315+ constexpr int nthreads = 128 ;
316+ const int lenx = box2d.length (0 );
317+ const int lenz = domain_box.length (2 );
318+ const int lox = box2d.smallEnd (0 );
319+ const int loy = box2d.smallEnd (1 );
320+ amrex::Gpu::DeviceVector<int > tmp (nblocks);
321+ auto * ptmp = tmp.data ();
322+ amrex::launch<nthreads>(nblocks, amrex::Gpu::gpuStream (),
323+ [=] AMREX_GPU_DEVICE ()
324+ {
325+ const int j = int (blockIdx.x ) / lenx + loy;
326+ const int i = int (blockIdx.x ) - j*lenx +lox;
327+ amrex::KeyValuePair<amrex::Real,int > r{std::numeric_limits<amrex::Real>::lowest (),0 };
328+ for (int k = threadIdx.x ; k < lenz; k += nthreads) {
329+ if (a (i,j,k) > r.first ()) { r.second () = k; r.first () = a (i,j,k);}
330+ }
331+ r = amrex::Gpu::blockReduceMax<nthreads>(r);
332+ if (threadIdx.x == 0 ) {
333+ ptmp[blockIdx.x ] = r.second ();
334+ }
253335 });
254336
255- #ifdef AMREX_USE_GPU
256- amrex::BaseFab<amrex::KeyValuePair<amrex::Real, int >> pinned_tg_fab (
257- device_tg_fab.box (), device_tg_fab.nComp (), amrex::The_Pinned_Arena ());
258- amrex::Gpu::dtoh_memcpy (
259- pinned_tg_fab.dataPtr (), device_tg_fab.dataPtr (),
260- pinned_tg_fab.nBytes ());
261- #else
262- auto & pinned_tg_fab = device_tg_fab;
263- #endif
337+ const auto dnval = m_dn;
338+ zi_sum = amrex::Reduce::Sum<amrex::Real>
339+ (nblocks, [=] AMREX_GPU_DEVICE (int iblock)
340+ {
341+ return (ptmp[iblock] + amrex::Real (0.5 )) * dnval;
342+ });
343+ }
264344
265- amrex::ParallelReduce::Max (
266- pinned_tg_fab.dataPtr (), static_cast <int >(pinned_tg_fab.size ()),
267- amrex::ParallelDescriptor::IOProcessorNumber (),
268- amrex::ParallelDescriptor::Communicator ());
345+ amrex::ParallelReduce::Sum (zi_sum, amrex::ParallelDescriptor::IOProcessorNumber (),
346+ amrex::ParallelDescriptor::Communicator ());
269347
270- if (amrex::ParallelDescriptor::IOProcessor ()) {
271- const auto dnval = m_dn;
272- auto * p = pinned_tg_fab.dataPtr ();
273- m_zi = amrex::Reduce::Sum<amrex::Real>(
274- pinned_tg_fab.size (),
275- [=] AMREX_GPU_DEVICE (int i) noexcept -> amrex::Real {
276- return (p[i].second () + 0.5 ) * dnval;
277- },
278- 0.0 );
279- m_zi /= static_cast <amrex::Real>(pinned_tg_fab.size ());
348+ amrex::Long npts = 1 ;
349+ for (int idim = 0 ; idim < AMREX_SPACEDIM; ++idim) {
350+ if (idim != dir) { npts *= domain_box.length (idim); }
280351 }
352+ m_zi = zi_sum / static_cast <amrex::Real>(npts);
281353}
282354
283355void ABLStats::process_output ()
0 commit comments