diff --git a/.gitignore b/.gitignore index 721f3a3f6..9bd1c40a0 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,5 @@ main_spec.log main_spec.out main_spec.pdf main_spec.toc +*.log +*~ \ No newline at end of file diff --git a/content/backmatter.tex b/content/backmatter.tex index dbd707baa..d5098e3a2 100644 --- a/content/backmatter.tex +++ b/content/backmatter.tex @@ -184,7 +184,9 @@ \chapter{Undefined Behavior in OpenSHMEM}\label{sec:undefined} \end{longtable} - +\color{ForestGreen} +\input{content/interoperability} +\color{black} \chapter{History of OpenSHMEM}\label{sec:openshmem_history} @@ -329,6 +331,22 @@ \section{Overview}\label{subsec:dep_overview} \\ \CorCpp: \FuncRef{shmem\_\FuncParam{TYPENAME}\_add}} & 1.4 & Current & \hyperref[subsec:shmem_atomic_add]{\FUNC{shmem\_atomic\_add}} \\ \hline Entire \Fortran API & 1.4 & Current & (none) \\ \hline + \CorCpp: \FuncRef{shmem\_barrier} & 1.5 & Current & + \hyperref[subsec:shmem_quiet]{\FUNC{shmem\_quiet}}; \hyperref[subsec:shmem_sync]{\FUNC{shmem\_sync}} \\ \hline + \CorCpp: Active set based \FuncRef{shmem\_sync} & 1.5 & Current & + Team based \hyperref[subsec:shmem_sync]{\FUNC{shmem\_sync}} \\ \hline + \CorCpp: \FuncRef{shmem\_broadcast[32,64]} & 1.5 & Current & + \hyperref[subsec:shmem_broadcast]{\FUNC{shmem\_broadcast}} \\ \hline + \CorCpp: \FuncRef{shmem\_collect[32,64]} & 1.5 & Current & + \hyperref[subsec:shmem_collect]{\FUNC{shmem\_collect}} \\ \hline + \CorCpp: \FuncRef{shmem\_fcollect[32,64]} & 1.5 & Current & + \hyperref[subsec:shmem_collect]{\FUNC{shmem\_fcollect}} \\ \hline + \CorCpp: \FuncRef{shmem\_\TYPENAME\_OP\_to\_all} & 1.5 & Current & + \hyperref[subsec:shmem_collect]{\FUNC{shmem\_\TYPENAME\_OP\_reduce}} \\ \hline + \CorCpp: \FuncRef{shmem\_alltoall[32,64]} & 1.5 & Current & + \hyperref[subsec:shmem_alltoall]{\FUNC{shmem\_alltoall}} \\ \hline + \CorCpp: \FuncRef{shmem\_alltoalls[32,64]} & 1.5 & Current & + \hyperref[subsec:shmem_alltoalls]{\FUNC{shmem\_alltoalls}} \\ \hline \end{longtable} 
\end{center} @@ -471,8 +489,30 @@ \subsection{\textit{Fortran} API}\label{subsec:deprecate-fortran} %% WARNING: Is \footnote{Formally, \Fortran[2003] is known as ISO/IEC~1539-1:2004(E).}. - - +\subsection{Active-set-based collective routines} +With the addition of \openshmem teams, the previous methods for performing collective +operations have been superseded by a more readable, flexible method for +organizing and communicating between groups of \acp{PE}. All collective routines +which previously indicated subgroups of \acp{PE} with a list of +parameters to describe the subgroup composition should be phased +out in favor of using collective operations with a team parameter. + +When moving from active set routines to teams based routines, the fixed-size +versions of the routines, e.g., \FUNC{shmem\_broadcast32}, were not +carried forward. Instead, all teams based collective routines use standard +\Cstd types with the option to use generic \textit{C11} functions for more portable +and maintainable implementations. + +\subsection{\CorCpp: \FUNC{shmem\_barrier}} +Each \openshmem team might +be associated with some number of communication contexts. The \FUNC{shmem\_barrier} +functions imply that the default context is quiesced after synchronizing +some set of \acp{PE}. Since teams may have some number of contexts associated +with the team, it becomes less clear which context would be the ``default'' context +for that particular team. Rather than continue to support \FUNC{shmem\_barrier} +for active-sets or teams, programs should use a call to \FUNC{shmem\_quiet} +followed by a call to \FUNC{shmem\_sync} in order to explicitly +indicate which context to quiesce. \chapter{Changes to this Document}\label{sec:changelog} @@ -482,13 +522,31 @@ \section{Version 1.5} The following list describes the specific changes in \openshmem[1.5]: \begin{itemize} % +\item Added support for nonblocking \ac{AMO} functions. +\\ See Section \ref{sec:amo-nbi}. 
+% +\item Added support for blocking \OPR{put-with-signal} functions. +\\ See Section \ref{subsec:shmem_put_signal}. +% +\item Added support for nonblocking \OPR{put-with-signal} functions. +\\ See Section \ref{subsec:shmem_put_signal_nbi}. +% +\item Clarified that point-to-point synchronization routines preserve the + atomicity of OpenSHMEM \acp{AMO}. +\\ See Section~\ref{subsec:amo_guarantees}. +% +\item Clarified that symmetric variables used as \VAR{ivar} arguments to + point-to-point synchronization routines must be updated using OpenSHMEM + \acp{AMO}. +\\ See Section~\ref{subsec:p2p_intro}. +% \item Removed the entire \openshmem \Fortran API. % \item Added support for multipliers in \VAR{SHMEM\_SYMMETRIC\_SIZE} environment variables. \\ See Section \ref{subsec:environment_variables}. % -\item Added a new multiple-element point-to-point synchronization API with +\item Added support for a multiple-element point-to-point synchronization API with the functions: \FUNC{shmem\_wait\_until\_all}, \FUNC{shmem\_wait\_until\_any}, \FUNC{shmem\_wait\_until\_some}, \FUNC{shmem\_test\_all}, \FUNC{shmem\_test\_any}, and \FUNC{shmem\_test\_some}. @@ -497,6 +555,17 @@ \section{Version 1.5} \ref{subsec:shmem_test_all}, \ref{subsec:shmem_test_any}, and \ref{subsec:shmem_test_some}. % +\item Added support for vectorized comparison values in the multiple-element + point-to-point synchronization API with the functions: + \FUNC{shmem\_wait\_until\_all\_vector}, \FUNC{shmem\_wait\_until\_any\_vector}, + \FUNC{shmem\_wait\_until\_some\_vector}, \\ + \FUNC{shmem\_test\_all\_vector}, \FUNC{shmem\_test\_any\_vector}, and + \FUNC{shmem\_test\_some\_vector}. + \\See Sections \ref{subsec:shmem_wait_until_all_vector}, + \ref{subsec:shmem_wait_until_any_vector}, \ref{subsec:shmem_wait_until_some_vector}, + \ref{subsec:shmem_test_all_vector}, \ref{subsec:shmem_test_any_vector}, and + \ref{subsec:shmem_test_some_vector}. +% \item Added \openshmem profiling interface. 
\\ See Section~\ref{sec:openshmem_profiling_interface}. % @@ -898,4 +967,4 @@ \section{Version 1.1} % \end{itemize} -} %end of setlength command that was started in frontmatter.tex +%end of setlength command that was started in frontmatter.tex diff --git a/content/collective_intro.tex b/content/collective_intro.tex index 3e15b888f..26db0b5f6 100644 --- a/content/collective_intro.tex +++ b/content/collective_intro.tex @@ -1,49 +1,128 @@ -\emph{Collective routines} are defined as communication or synchronization -operations on a group of \acp{PE} called an active set. The collective -routines require all \acp{PE} in the active set to simultaneously call the +\emph{Collective routines} are defined as coordinated communication or synchronization +operations performed by a group of \acp{PE}. + +\openshmem provides three types of collective routines: + +\begin{enumerate} +\item Collective routines that operate on teams use a team handle parameter to determine + which \acp{PE} will participate in the routine, and use resources encapsulated by the team object + to perform operations. See Section~\ref{subsec:team} for details on team management. + +\begin{DeprecateBlock} +\item Collective routines that operate on active sets use a set of parameters to determine + which \acp{PE} will participate and what resources are used to perform operations. +\end{DeprecateBlock} + +\item Collective routines that accept neither team nor active set + parameters, which implicitly operate on the default team and, as + required, the default context. +\end{enumerate} + +\subsubsection*{Team-based collectives} + +The team-based collective routines are performed with respect to a valid +\openshmem team, which is specified by a team handle argument. +Team-based collective operations require all \acp{PE} in the team to call +the routine in order for the operation to complete. 
If an invalid team handle +or \LibConstRef{SHMEM\_TEAM\_INVALID} is passed to a team-based collective +routine, the behavior is undefined. + +Team objects encapsulate the per \ac{PE} system resources required to complete +team-based collective routines. +All \openshmem team-based collective calls are blocking routines which may use those +system resources. On completion of a team-based collective call, the \ac{PE} may +immediately call another collective on that same team without any other intervening +synchronization across the team. + +While \openshmem routines provide thread support according to the +thread-support level provided at initialization (see +Section~\ref{subsec:thread_support}), team-based collective routines +may not be called simultaneously by multiple threads on a given team. + +Collective operations are matched across a given team based on ordering. So for a given team, +collectives must occur in the same order across all \acp{PE} in a team. + +The team-based collective routines defined in the \openshmem Specification are: + +\begin{itemize} +\item \FUNC{shmem\_team\_sync} +\item \FUNC{shmem\_\{TYPE\_\}broadcast\{mem\}} +\item \FUNC{shmem\_\{TYPE\_\}collect\{mem\}} +\item \FUNC{shmem\_\{TYPE\_\}fcollect\{mem\}} +\item Reduction routines for the following operations: AND, OR, XOR, MAX, MIN, SUM, PROD +\item \FUNC{shmem\_\{TYPE\_\}alltoall\{mem\}} +\item \FUNC{shmem\_\{TYPE\_\}alltoalls\{mem\}} +\end{itemize} + +In addition, all team creation functions are collective operations. In addition to the ordering +and thread safety requirements described here, there are additional synchronization requirements +on team creation operations. See Section~\ref{subsec:team} for more details. + +\begin{DeprecateBlock} + +\subsubsection*{Active-set-based collectives} + +The active-set-based collective routines require all \acp{PE} +in the active set to simultaneously call the routine. 
A \ac{PE} that is not in the active set calling the collective -routine results in undefined behavior. All collective routines have an -active set as an input parameter except \FUNC{shmem\_barrier\_all} and -\FUNC{shmem\_sync\_all}. Both \FUNC{shmem\_barrier\_all} and -\FUNC{shmem\_sync\_all} must be called by all \acp{PE} of the \openshmem program. +routine results in undefined behavior. The active set is defined by the arguments \VAR{PE\_start}, \VAR{logPE\_stride}, and \VAR{PE\_size}. \VAR{PE\_start} specifies the starting \ac{PE} number and -is the lowest numbered PE in the active set. The stride between successive +is the lowest numbered \ac{PE} in the active set. The stride between successive \acp{PE} in the active set is $2^{logPE\_stride}$ and \VAR{logPE\_stride} must be greater than or equal to zero. \VAR{PE\_size} specifies the number of \acp{PE} in the active set and must be greater than zero. The active set must satisfy the requirement that its last member corresponds to a valid \ac{PE} number, that is $0 \le PE\_start + (PE\_size - 1) * 2^{logPE\_stride} < npes$. -All \acp{PE} participating in the collective routine must provide the same + +All \acp{PE} participating in the active-set-based collective routine must provide the same values for these arguments. If any of these requirements are not met, the behavior is undefined. -Another argument important to collective routines is \VAR{pSync}, which is a -symmetric work array. All \acp{PE} participating in a collective must pass the -same \VAR{pSync} array. On completion of a collective call, the \VAR{pSync} is +Another argument important to active-set-based collective routines is \VAR{pSync}, which is a +symmetric work array. All \acp{PE} participating in an active-set-based collective must pass the +same \VAR{pSync} array. On completion of such a collective call, the \VAR{pSync} is restored to its original contents. 
The user is permitted to reuse a \VAR{pSync} array if all previous collective routines using the \VAR{pSync} array have been -completed by all participating \acp{PE}. One can use a synchronization -collective routine such as \FUNC{shmem\_barrier} to ensure completion of previous collective +completed by all participating \acp{PE}. One can use a synchronization +collective routine such as \FUNC{shmem\_barrier} to ensure completion of previous active-set-based collective routines. The \FUNC{shmem\_barrier} and \FUNC{shmem\_sync} routines allow the same \VAR{pSync} array to be used on consecutive calls as long as the \acp{PE} in the active set do not change. All collective routines defined in the Specification are blocking. The -collective routines return on completion. The collective routines defined in -the \openshmem Specification are: +collective routines return on completion. The active-set-based collective +routines defined in the \openshmem Specification are: \begin{itemize} -\item \FUNC{shmem\_barrier\_all} \item \FUNC{shmem\_barrier} -\item \FUNC{shmem\_sync\_all} \item \FUNC{shmem\_sync} \item \FUNC{shmem\_broadcast\{32, 64\}} \item \FUNC{shmem\_collect\{32, 64\}} \item \FUNC{shmem\_fcollect\{32, 64\}} -\item Reductions for the following operations: AND, MAX, MIN, SUM, PROD, OR, XOR +\item Reduction routines for the following operations: AND, MAX, MIN, SUM, PROD, OR, XOR \item \FUNC{shmem\_alltoall\{32, 64\}} \item \FUNC{shmem\_alltoalls\{32, 64\}} \end{itemize} + +\end{DeprecateBlock} + + +\subsubsection*{Team-implicit collectives} + +The \FUNC{shmem\_sync\_all} routine synchronizes all \acp{PE} in the +computation through the default team. This routine is equivalent to a +call to \FUNC{shmem\_team\_sync} on the default team. + +The \FUNC{shmem\_barrier\_all} routine synchronizes all \acp{PE} in +the default team and ensures completion of all local and remote memory +updates issued via the default context. 
This routine is equivalent to +a call to \FUNC{shmem\_ctx\_quiet} on the default context followed by a +call to \FUNC{shmem\_team\_sync} on the default team. + +\subsubsection*{Error codes returned from collectives} +\CorCpp routines that return an integer error code follow the convention that \CONST{0} indicates successful local completion of the operation. This is considered a best effort of the implementation to indicate that all required local operations have been performed correctly inside the routine and the internal \openshmem state on the calling \ac{PE} is consistent with the description of the routine and its arguments upon completion. Implementations may use an integer return value from a routine to define integer error codes specific to the implementation as long as those codes are not already explicitly defined for that routine by the \openshmem specification. + +Collective operations involving many \acp{PE} may return values indicating success while other \acp{PE} are still executing the collective operation. Return values indicating success of a collective routine on one \ac{PE} do not indicate that all \acp{PE} involved in the collective operation will return from the routine successfully. In the case where successful local completion of a collective implies the success of some global operation, such as team creation resulting in a valid team on all involved \acp{PE}, the implementation should not return \CONST{0} from the routine if the implied or stated global guarantees of the routine are not met. diff --git a/content/coverpage.tex b/content/coverpage.tex index d7692e386..15ca21dbe 100644 --- a/content/coverpage.tex +++ b/content/coverpage.tex @@ -59,6 +59,7 @@ \section*{Current Authors and Collaborators} \item Mike Dubman, Mellanox \item Karl Feind, \ac{HPE} \item Manjunath Gorentla Venkata, \ac{ORNL} +\item Megan Grodowitz, Arm Inc. 
\item Max Grossman, Rice University \item Khaled Hamidouche, \ac{AMD} \item Jeff Hammond, Intel @@ -78,7 +79,7 @@ \section*{Current Authors and Collaborators} \item Naveen Ravichandrasekaran, Cray Inc. \item Michael Raymond, \ac{HPE} \item James Ross, \ac{ARL} -\item Pavel Shamis, ARM Inc. +\item Pavel Shamis, Arm Inc. \item Sameer Shende, \ac{UO} \item Lauren Smith, \ac{DoD} diff --git a/content/interoperability.tex b/content/interoperability.tex new file mode 100644 index 000000000..1ce88945d --- /dev/null +++ b/content/interoperability.tex @@ -0,0 +1,191 @@ +\chapter{Interoperability with Other Programming Models}\label{sec:interoperability} + +OpenSHMEM routines may be used in conjunction with the routines of other +communication libraries or parallel languages in the same program. This section +describes the interoperability with other programming models, including +clarification of undefined behaviors caused by mixed use of different models, +advice to \openshmem library users and developers that may improve the portability +and performance of hybrid programs, and definition of an OpenSHMEM +API that queries the interoperability features provided by an \openshmem library. + + +\section{MPI Interoperability} + +\openshmem and MPI are two commonly used parallel programming models for +distributed-memory systems. The user can choose to utilize both models in the same program +to efficiently and easily support various communication patterns. + +A vendor may implement the \openshmem and MPI libraries in different ways. For +instance, one may implement both \openshmem and MPI as standalone libraries, +each of which allocates and initializes fully isolated communication +resources. +Another common approach +is to implement both \openshmem and MPI interfaces within the +same software system in order to share a communication resource when possible. 
+ +To improve interoperability and portability in \openshmem + MPI hybrid +programming, we clarify the relevant semantics in the following subsections. + + +\subsection{Initialization} +In order to ensure that a hybrid program can be portably performed with different vendor +implementations, the \openshmem environment of the program must be initialized by +a call to \FUNC{shmem\_init} or \FUNC{shmem\_init\_thread} and be finalized by +a call to \FUNC{shmem\_finalize}; the MPI environment of the program must be initialized +by a call to \FUNC{MPI\_Init} or \FUNC{MPI\_Init\_thread} and be finalized by a +call to \FUNC{MPI\_Finalize}. + +\apiimpnotes{ +Portable implementations of OpenSHMEM and MPI must ensure that the initialization +calls can be made in an arbitrary order within a program; the same rule also +applies to the finalization calls. A software runtime that utilizes a shared +communication resource for \openshmem and MPI communication may maintain an +internal reference counter in order to ensure that the shared resource is +initialized only once and thus no shared resource is released until the last +finalization call is made. +} + + +\subsection{Dynamic Process Creation} +\label{subsec:interoperability:mpmd} + +MPI defines a dynamic process model that allows creation of processes after +an MPI application has started (e.g., by calling \FUNC{MPI\_Comm\_spawn}) and +connection to independent processes (e.g., through \FUNC{MPI\_Comm\_accept} +and \FUNC{MPI\_Comm\_connect}) +and provides a mechanism to establish communication +between the newly created processes and the existing MPI application (see +MPI standard version 3.1, Chapter 10). +Unlike MPI, \openshmem starts all processes at once and requires all PEs to +collectively allocate and initialize resources (e.g., symmetric heap) used by +the \openshmem library before any other \openshmem routine may +be called. 
Communicating with a dynamically created process in the \openshmem +environment may result in undefined behavior. +Hence, users should not use \openshmem and MPI dynamic process models +in the same program. + + +\subsection{Thread Safety} +\label{subsec:interoperability:thread} +Both \openshmem and MPI define the interaction with user threads in a program +with routines that can be used for initializing and querying the thread +environment. In a hybrid program, the user may request different thread levels +at the initialization calls of \openshmem and MPI environments; however, the +returned support level provided by the \openshmem library might be different +from that returned in an \openshmem-only program. For instance, the former +initialization call in a hybrid program may initialize a resource with the +user-requested thread level, but the supported level cannot be updated by the latter +initialization call if the underlying software runtime of \openshmem and MPI +share the same internal communication resource. +The program should always check the \VAR{provided} thread level returned +at the corresponding initialization call or query the level of thread support +after initialization to portably ensure thread support in each communication +environment. + +Both \openshmem and MPI define similar thread levels, namely, \VAR{THREAD\_SINGLE}, +\VAR{THREAD\_FUNNELED}, \VAR{THREAD\_SERIALIZED}, and \VAR{THREAD\_MULTIPLE}. +When requesting threading support in a hybrid program, however, +the following additional rules are applied if the implementations of \openshmem +and MPI share the same internal communication resource. +Users are strongly advised to always follow these rules to ensure program +portability. + +\begin{itemize} + \item The \VAR{THREAD\_SINGLE} thread level requires a single-threaded program. 
+ Hence, users should not request \VAR{THREAD\_SINGLE} at the initialization + call of either \openshmem or MPI but request a different thread level at the + initialization call of the other model in the same program. + + \item The \VAR{THREAD\_FUNNELED} thread level allows only the main thread to + make communication calls. A hybrid program using the \VAR{THREAD\_FUNNELED} + thread level in both \openshmem and MPI should ensure that the same main thread + is used in both communication environments. + + \item The \VAR{THREAD\_SERIALIZED} thread level requires the program to ensure + that communication calls are not made concurrently by multiple threads. If a + hybrid program uses \VAR{THREAD\_SERIALIZED} in one communication environment + and \VAR{THREAD\_SERIALIZED} or \VAR{THREAD\_FUNNELED} in the other one, it + should also guarantee that the \openshmem and MPI calls are not made concurrently + from two distinct threads. +\end{itemize} + +\subsection{Mapping Process Identification Numbers} +\label{subsec:interoperability:id} + +Similar to the PE identifier in \openshmem, MPI defines rank as the +identification number of a process in a communicator. Both the \openshmem PE +and the MPI rank are unique integers assigned from zero to one less than the total +number of processes. In a hybrid program, the \openshmem +PE and the MPI rank in \VAR{MPI\_COMM\_WORLD} of a process can be equal. +This feature, however, may be provided by only some of the \openshmem and MPI +implementations (e.g., if both environments share the same underlying process +manager) and is not portably guaranteed. A portable program should always +use the standard functions in each model, namely, \FUNC{shmem\_my\_pe} in \openshmem +and \FUNC{MPI\_Comm\_rank} in MPI, to query the process identification numbers +in each communication environment and manage the mapping of identifiers in the +program when necessary. 
+ +\subsubsection*{Example} +\label{subsubsec:interoperability:id:example} +The following example demonstrates how to manage the mapping between \openshmem +PE identifier and MPI ranks in \VAR{MPI\_COMM\_WORLD} in a hybrid \openshmem +and MPI program. + +\lstinputlisting[language={C}, tabsize=2, + basicstyle=\ttfamily\footnotesize] + {example_code/hybrid_mpi_mapping_id.c} + +\subsection{RMA Programming Models} +\label{subsec:interoperability:rma} + +\openshmem and MPI each define similar one-sided communication models; +however, a portable program should not assume interoperability between these +models. +For instance, \openshmem guarantees the atomicity only of concurrent \openshmem AMO operations +that operate on symmetric data with the same datatype. Access to the same symmetric +object with MPI atomic operations, such as an \FUNC{MPI\_Fetch\_and\_op}, may +result in an undefined result. Furthermore, +because most RMA programs can be written by using either \openshmem or MPI RMA, +users should choose only one of the RMA models in the same program, whenever +possible, for performance and code simplicity. + +\subsection{Communication Progress} +\label{subsec:interoperability:progress} + +\openshmem promises the progression of communication both with and without +\openshmem calls and requires the software progress mechanism in the implementation +(e.g., a progress thread) when the hardware does not provide asynchronous communication +capabilities. In MPI, however, a weak progress semantics is applied. That is, +an MPI communication call is guaranteed only to complete in finite time. For +instance, an \FUNC{MPI\_Put} may be completed only when the remote process makes an MPI +call that internally triggers the progress of MPI, if the underlying hardware +does not support asynchronous communication. A hybrid program +should not assume that the \openshmem library also makes progress for MPI. 
+A call to \FUNC{shmem\_query\_interoperability} with the \VAR{SHMEM\_PROGRESS\_MPI} +property (see definition in \ref{subsec:interoperability:query}) +can be used to portably check whether the implementation provides asynchronous +progression also for MPI. If it is not provided, the user program may have to +explicitly manage the asynchronous communication in MPI in +order to prevent any deadlock or performance degradation. + +\apiimpnotes{ +Implementations that provide both \openshmem and MPI interfaces should try +to ensure progress for both models, when necessary and possible, for performance +reasons. For instance, an implementation +may utilize a software progress thread to process any software-handled +communication requests, after the user program has called +\FUNC{shmem\_init} and \FUNC{MPI\_Init} provided by the same system. +} + + +\section{Query Interoperability} + +A hybrid user program can query the interoperability feature of an \openshmem +implementation in order to avoid unnecessary overhead and programming complexity. +For instance, the user program can eliminate manual progress polling for MPI +communication if the \openshmem implementation guarantees asynchronous +communication also for MPI. + +\subsection{\textbf{SHMEM\_QUERY\_INTEROPERABILITY}} +\label{subsec:interoperability:query} +\input{content/shmem_query_interoperability} \ No newline at end of file diff --git a/content/library_constants.tex b/content/library_constants.tex index 703f3e48e..e843573fa 100644 --- a/content/library_constants.tex +++ b/content/library_constants.tex @@ -39,6 +39,23 @@ See Section~\ref{subsec:thread_support} for more detail about its use. \tabularnewline \hline %% +\LibConstDecl{SHMEM\_TEAM\_NUM\_CONTEXTS} & +The bitwise flag which specifies that a team creation routine should use the +\VAR{num\_contexts} member of the provided +\CTYPE{shmem\_team\_config\_t} configuration parameter as a request. 
+See Sections~\ref{subsec:shmem_team_config_t} and +\ref{subsec:shmem_team_split_strided} for more detail about its use. +\tabularnewline \hline +%% +\LibConstDecl{SHMEM\_TEAM\_INVALID} & +A value corresponding to an invalid team. +This value can be used to initialize or update team handles to indicate +that they do not reference a valid team. +When managed in this way, applications can use an equality comparison +to test whether a given team handle references a valid team. +See Section~\ref{subsec:team} for more detail about its use. +\tabularnewline \hline +%% \LibConstDecl{SHMEM\_CTX\_INVALID} & A value corresponding to an invalid communication context. This value can be used to initialize or update context handles to indicate @@ -67,6 +84,18 @@ See Section~\ref{subsec:shmem_ctx_create} for more detail about its use. \tabularnewline \hline %% +\LibConstDecl{SHMEM\_SIGNAL\_SET} & +An integer constant expression corresponding to the signal update set operation. +See Section~\ref{subsec:shmem_put_signal} and +Section~\ref{subsec:shmem_put_signal_nbi} for more detail about its use. +\tabularnewline \hline +%% +\LibConstDecl{SHMEM\_SIGNAL\_ADD} & +An integer constant expression corresponding to the signal update add operation. +See Section~\ref{subsec:shmem_put_signal} and +Section~\ref{subsec:shmem_put_signal_nbi} for more detail about its use. +\tabularnewline \hline +%% \LibConstDecl{SHMEM\_SYNC\_VALUE} \begin{DeprecateBlock} \LibConstDecl{\_SHMEM\_SYNC\_VALUE} diff --git a/content/library_handles.tex b/content/library_handles.tex index d2ec45a48..2f674b3c3 100644 --- a/content/library_handles.tex +++ b/content/library_handles.tex @@ -13,6 +13,24 @@ \tabularnewline \hline \endhead %% +\LibHandleDecl{SHMEM\_TEAM\_WORLD} & +Handle of type \CTYPE{shmem\_team\_t} that corresponds to the +default team of all \acp{PE} in the \openshmem program. 
All point-to-point +communication operations and collective synchronizations that do not specify a team +are performed on the default team. +See Section~\ref{subsec:team} for more detail about its use. +\tabularnewline \hline +%% +\LibHandleDecl{SHMEM\_TEAM\_SHARED} & +Handle of type \CTYPE{shmem\_team\_t} that corresponds to a team of \acp{PE} +that share a memory domain. When this handle is used by some \ac{PE}, +it will refer to the team of all \acp{PE} that would return a non-null +pointer from \FUNC{shmem\_ptr} for symmetric objects on that \ac{PE}, +and vice versa. This means that symmetric objects on each \ac{PE} are +directly load/store accessible by all \acp{PE} in the team. +See Section~\ref{subsec:team} for more detail about its use. +\tabularnewline \hline +%% \LibHandleDecl{SHMEM\_CTX\_DEFAULT} & Handle of type \CTYPE{shmem\_ctx\_t} that corresponds to the default communication context. All point-to-point communication operations diff --git a/content/memory_model.tex b/content/memory_model.tex index eb019da1e..82fde3ed5 100644 --- a/content/memory_model.tex +++ b/content/memory_model.tex @@ -51,6 +51,9 @@ \subsection{Atomicity Guarantees}\label{subsec:amo_guarantees} guarantee that concurrent accesses by any of these routines to the same location and using the same datatype (specified in Tables~\ref{stdamotypes} and \ref{extamotypes}) will be exclusive. +Exclusivity is also guaranteed when the target \ac{PE} performs a wait or test +operation on the same location and with the same datatype as one or more atomic +operations. \openshmem atomic operations do not guarantee exclusivity in the following scenarios, all of which result in undefined behavior. \begin{enumerate} diff --git a/content/p2p_sync_intro.tex b/content/p2p_sync_intro.tex index d30428318..f2132e0bc 100644 --- a/content/p2p_sync_intro.tex +++ b/content/p2p_sync_intro.tex @@ -3,8 +3,8 @@ object. 
The point-to-point synchronization routines can be used to portably ensure that memory access operations observe remote updates in the order enforced by -the initiator \ac{PE} using the \FUNC{shmem\_fence} and \FUNC{shmem\_quiet} -routines. +the initiator \ac{PE} using the \OPR{put-with-signal}, \FUNC{shmem\_fence} and +\FUNC{shmem\_quiet} routines. Where appropriate compiler support is available, \openshmem provides type-generic point-to-point synchronization interfaces via \Cstd[11] generic diff --git a/content/programming_model_overview.tex b/content/programming_model_overview.tex index d6c143c60..5d74a0aeb 100644 --- a/content/programming_model_overview.tex +++ b/content/programming_model_overview.tex @@ -107,6 +107,11 @@ \item \OPR{Barrier}: All or some \acp{PE} collectively synchronize and ensure completion of all remote and local updates prior to any \ac{PE} returning from the call. + \item \OPR{Wait and Test}: A \ac{PE} calling a point-to-point synchronization + routine ensures the value of a local symmetric object meets a specified + condition. Wait operations block until the specified condition is + met, whereas test operations return immediately and indicate whether or + not the specified condition is met. \end{enumerate} \item \textbf{Collective Communication} diff --git a/content/rma_intro.tex b/content/rma_intro.tex index af4a3dacc..3c912e1f9 100644 --- a/content/rma_intro.tex +++ b/content/rma_intro.tex @@ -16,6 +16,14 @@ routine, \GET{}, the origin \ac{PE} provides the \dest{} data object and the destination \ac{PE} provides the \source{} data object. +The destination \ac{PE} is specified as an integer representing the \ac{PE} number. +This \ac{PE} number is relative to the team associated with the +communication context being used for the operation. If no context argument is passed to the routine, +then the routine operates on the default context, which implies that +the \ac{PE} number is relative to the default team. 
+If the \ac{PE} number passed to the routine is invalid, being negative +or greater than or equal to the size of the \openshmem team, then the behavior is undefined. + Where appropriate compiler support is available, \openshmem provides type-generic one-sided communication interfaces via \Cstd[11] generic selection (\Cstd[11]~\S6.5.1.1\footnote{Formally, the \Cstd[11] specification is ISO/IEC 9899:2011(E).}) diff --git a/content/shmem_alltoall.tex b/content/shmem_alltoall.tex index 6657d05ad..b5dcfa61b 100644 --- a/content/shmem_alltoall.tex +++ b/content/shmem_alltoall.tex @@ -1,17 +1,37 @@ \apisummary{ - shmem\_alltoall is a collective routine where each \ac{PE} exchanges a fixed amount of data with all other \acp{PE} in the - active set. + shmem\_alltoall is a collective routine where each \ac{PE} exchanges a fixed amount of data with all other \acp{PE} participating in the collective. } \begin{apidefinition} +%% C11 +\begin{C11synopsis} +int @\FuncDecl{shmem\_alltoall}@(shmem_team_t team, TYPE *dest, const TYPE *source, size_t nelems); +\end{C11synopsis} +where \TYPE{} is one of the standard \ac{RMA} types specified by Table \ref{stdrmatypes}. + \begin{Csynopsis} +\end{Csynopsis} +\begin{CsynopsisCol} +int @\FuncDecl{shmem\_\FuncParam{TYPENAME}\_alltoall}@(shmem_team_t team, TYPE *dest, const TYPE *source, size_t nelems); +\end{CsynopsisCol} +where \TYPE{} is one of the standard \ac{RMA} types and has a corresponding \TYPENAME{} specified by Table \ref{stdrmatypes}. 
+ +\begin{CsynopsisCol} +int @\FuncDecl{shmem\_alltoallmem}@(shmem_team_t team, void *dest, const void *source, size_t nelems); +\end{CsynopsisCol} + +\begin{DeprecateBlock} +\begin{CsynopsisCol} void @\FuncDecl{shmem\_alltoall32}@(void *dest, const void *source, size_t nelems, int PE_start, int logPE_stride, int PE_size, long *pSync); void @\FuncDecl{shmem\_alltoall64}@(void *dest, const void *source, size_t nelems, int PE_start, int logPE_stride, int PE_size, long *pSync); -\end{Csynopsis} +\end{CsynopsisCol} +\end{DeprecateBlock} \begin{apiarguments} +\apiargument{IN}{team}{A valid \openshmem team handle to a team.} + \apiargument{OUT}{dest}{A symmetric data object large enough to receive the combined total of \VAR{nelems} elements from each \ac{PE} in the active set.} @@ -20,6 +40,8 @@ destination \ac{PE}.} \apiargument{IN}{nelems}{The number of elements to exchange for each \ac{PE}. \VAR{nelems} must be of type size\_t for \CorCpp.} + +\begin{DeprecateBlock} \apiargument{IN}{PE\_start}{The lowest \ac{PE} number of the active set of \acp{PE}. \VAR{PE\_start} must be of type integer.} \apiargument{IN}{logPE\_stride}{The log (base 2) of the stride between @@ -33,49 +55,81 @@ Every element of this array must be initialized with the value \CONST{SHMEM\_SYNC\_VALUE} before any of the \acp{PE} in the active set enter the routine.} +\end{DeprecateBlock} \end{apiarguments} \apidescription{ The \FUNC{shmem\_alltoall} routines are collective routines. Each \ac{PE} - in the active set exchanges \VAR{nelems} data elements of size - 32 bits (for \FUNC{shmem\_alltoall32}) or 64 bits (for \FUNC{shmem\_alltoall64}) - with all other \acp{PE} in the set. The data being sent and received are + participating in the operation exchanges \VAR{nelems} data elements + with all other \acp{PE} participating in the operation. 
+ The size of a data element is: + \begin{itemize} + \item 32 bits for \FUNC{shmem\_alltoall32} + \item 64 bits for \FUNC{shmem\_alltoall64} + \item 8 bits for \FUNC{shmem\_alltoallmem} + \item \FUNC{sizeof}(\TYPE{}) for alltoall routines taking typed \VAR{source} and \VAR{dest} + \end{itemize} + + The data being sent and received are stored in a contiguous symmetric data object. The total size of each \acp{PE} \VAR{source} object and \VAR{dest} object is \VAR{nelems} times the size of - an element (32 bits or 64 bits) times \VAR{PE\_size}. - The \VAR{source} object contains \VAR{PE\_size} blocks of data (the size of each - block defined by \VAR{nelems}) and each block of data is sent to a different \ac{PE}. - Given a \ac{PE} \VAR{i} that is the \kth PE in the active set and a \ac{PE} - \VAR{j} that is the \lth \ac{PE} in the active set, + an element + times \VAR{N}, where \VAR{N} equals the number of \acp{PE} participating + in the operation. + The \VAR{source} object contains \VAR{N} blocks of data + (where the size of each block is defined by \VAR{nelems}) and each block of data + is sent to a different \ac{PE}. + + The same \dest{} and \source{} + arrays, and the same value for \VAR{nelems}, + must be passed by all \acp{PE} that participate in the collective. + + Given a \ac{PE} \VAR{i} that is the \kth \ac{PE} + participating in the operation and a \ac{PE} + \VAR{j} that is the \lth \ac{PE} + participating in the operation, + \ac{PE} \VAR{i} sends the \lth block of its \VAR{source} object to the \kth block of the \VAR{dest} object of \ac{PE} \VAR{j}. - As with all \openshmem collective routines, this routine assumes - that only \acp{PE} in the active set call the routine. If a \ac{PE} not - in the active set calls an \openshmem collective routine, + Team-based alltoall routines operate over all \acp{PE} in the provided team + argument. All \acp{PE} in the provided team must participate in the collective.
+ + Active-set-based collective routines operate over all \acp{PE} in the active set + defined by the \VAR{PE\_start}, \VAR{logPE\_stride}, \VAR{PE\_size} triplet. + + As with all active-set-based collective routines, + this routine assumes that only \acp{PE} in the active set call the routine. + If a \ac{PE} not in the active set calls an + active-set-based collective routine, the behavior is undefined. - The values of arguments \VAR{nelems}, \VAR{PE\_start}, \VAR{logPE\_stride}, + The values of arguments \VAR{PE\_start}, \VAR{logPE\_stride}, and \VAR{PE\_size} must be equal on all \acp{PE} in the active set. The same - \VAR{dest} and \VAR{source} data objects, and the same \VAR{pSync} work + \VAR{pSync} work array must be passed to all \acp{PE} in the active set. Before any \ac{PE} calls a \FUNC{shmem\_alltoall} routine, the following conditions must be ensured: \begin{itemize} - \item The \VAR{pSync} array on all \acp{PE} in the active set is not - still in use from a prior call to a \FUNC{shmem\_alltoall} routine. \item The \VAR{dest} data object on all \acp{PE} in the active set is ready to accept the \FUNC{shmem\_alltoall} data. + \item For active-set-based routines, the \VAR{pSync} array + on all \acp{PE} in the active set is not still in use from a prior call + to a \FUNC{shmem\_alltoall} routine. \end{itemize} Otherwise, the behavior is undefined. Upon return from a \FUNC{shmem\_alltoall} routine, the following is true for - the local PE: Its \VAR{dest} symmetric data object is completely updated and + the local PE: + \begin{itemize} + \item Its \VAR{dest} symmetric data object is completely updated and the data has been copied out of the \VAR{source} data object. - The values in the \VAR{pSync} array are restored to the original values. + \item For active-set-based routines, + the values in the \VAR{pSync} array are restored to the original values. 
+ \end{itemize} } \apidesctable{ @@ -87,7 +141,7 @@ \apitablerow{shmem\_alltoall32}{\CONST{32} bits aligned.} \apireturnvalues{ - None. + Zero on successful local completion. Nonzero otherwise. } \apinotes{ @@ -111,7 +165,7 @@ \begin{apiexamples} \apicexample - {This example shows a \FUNC{shmem\_alltoall64} on two long elements among all + {This \CorCpp{} example shows a \FUNC{shmem\_int64\_alltoall} on two 64-bit integers among all \acp{PE}.} {./example_code/shmem_alltoall_example.c} {} diff --git a/content/shmem_alltoalls.tex b/content/shmem_alltoalls.tex index f6cebaf37..53e034f7f 100644 --- a/content/shmem_alltoalls.tex +++ b/content/shmem_alltoalls.tex @@ -1,18 +1,38 @@ \apisummary{ - shmem\_alltoalls is a collective routine where each \ac{PE} exchanges a fixed amount of strided data with all other - \acp{PE} in the active set. + shmem\_alltoalls is a collective routine where each \ac{PE} exchanges a fixed amount of strided data with all other \acp{PE} participating in the collective. } \begin{apidefinition} +%% C11 +\begin{C11synopsis} +int @\FuncDecl{shmem\_alltoalls}@(shmem_team_t team, TYPE *dest, const TYPE *source, ptrdiff_t dst, ptrdiff_t sst, size_t nelems); +\end{C11synopsis} +where \TYPE{} is one of the standard \ac{RMA} types specified by Table \ref{stdrmatypes}. + \begin{Csynopsis} +\end{Csynopsis} +\begin{CsynopsisCol} +int @\FuncDecl{shmem\_\FuncParam{TYPENAME}\_alltoalls}@(shmem_team_t team, TYPE *dest, const TYPE *source, ptrdiff_t dst, ptrdiff_t sst, size_t nelems); +\end{CsynopsisCol} +where \TYPE{} is one of the standard \ac{RMA} types and has a corresponding \TYPENAME{} specified by Table \ref{stdrmatypes}. 
+ +\begin{CsynopsisCol} +int @\FuncDecl{shmem\_alltoallsmem}@(shmem_team_t team, void *dest, const void *source, ptrdiff_t dst, ptrdiff_t sst, size_t nelems); +\end{CsynopsisCol} + +\begin{DeprecateBlock} +\begin{CsynopsisCol} void @\FuncDecl{shmem\_alltoalls32}@(void *dest, const void *source, ptrdiff_t dst, ptrdiff_t sst, size_t nelems, int PE_start, int logPE_stride, int PE_size, long *pSync); void @\FuncDecl{shmem\_alltoalls64}@(void *dest, const void *source, ptrdiff_t dst, ptrdiff_t sst, size_t nelems, int PE_start, int logPE_stride, int PE_size, long *pSync); -\end{Csynopsis} +\end{CsynopsisCol} +\end{DeprecateBlock} \begin{apiarguments} -\apiargument{OUT}{dest}{A symmetric data object large enough to receive +\apiargument{IN}{team}{A valid \openshmem team handle.} + +\apiargument{OUT}{dest}{A symmetric data object large enough to receive the combined total of \VAR{nelems} elements from each \ac{PE} in the active set.} \apiargument{IN}{source}{A symmetric data object that contains \VAR{nelems} @@ -25,7 +45,9 @@ \apiargument{IN}{sst}{The stride between consecutive elements of the \source{} data object. The stride is scaled by the element size. A value of \CONST{1} indicates contiguous data. \VAR{sst} must be - of type \CTYPE{ptrdiff\_t}.} + of type \CTYPE{ptrdiff\_t}.} + +\begin{DeprecateBlock} \apiargument{IN}{nelems}{The number of elements to exchange for each \ac{PE}. \VAR{nelems} must be of type size\_t for \CorCpp.} \apiargument{IN}{PE\_start}{The lowest \ac{PE} number of the active set of @@ -41,81 +63,59 @@ Every element of this array must be initialized with the value \CONST{SHMEM\_SYNC\_VALUE} before any of the \acp{PE} in the active set enter the routine.} +\end{DeprecateBlock} \end{apiarguments} \apidescription{ - The \FUNC{shmem\_alltoalls} routines are collective routines.
Each \ac{PE} - in the active set exchanges \VAR{nelems} strided data elements of size - 32 bits (for \FUNC{shmem\_alltoalls32}) or 64 bits (for \FUNC{shmem\_alltoalls64}) - with all other \acp{PE} in the set. Both strides, \VAR{dst} and \VAR{sst}, must be greater + The \FUNC{shmem\_alltoalls} routines are collective routines. + These routines are equivalent in functionality to the corresponding + \FUNC{shmem\_alltoall} routines except that they add explicit stride values + for accessing the source and destination data arrays, whereas the array + access in \FUNC{shmem\_alltoall} is always with a stride of \CONST{1}. + + Each \ac{PE} participating in the operation + exchanges \VAR{nelems} strided data elements + with all other \acp{PE} participating in the operation. + Both strides, \VAR{dst} and \VAR{sst}, must be greater than or equal to \CONST{1}. - Given a \ac{PE} \VAR{i} that is the \kth PE in the active set and a \ac{PE} - \VAR{j} that is the \lth \ac{PE} in the active set, + + The same \dest{} and \source{} arrays and the same values for the + arguments \VAR{dst}, \VAR{sst}, and \VAR{nelems} must be passed by all \acp{PE} + that participate in the collective. + + Given a \ac{PE} \VAR{i} that is the \kth \ac{PE} + participating in the operation and a \ac{PE} + \VAR{j} that is the \lth \ac{PE} + participating in the operation, \ac{PE} \VAR{i} sends the \VAR{sst}*\lth block of the \VAR{source} data object to the \VAR{dst}*\kth block of the \VAR{dest} data object on \ac{PE} \VAR{j}. - As with all \openshmem collective routines, these routines assume - that only \acp{PE} in the active set call the routine. If a \ac{PE} not - in the active set calls an \openshmem collective routine, undefined - behavior results. - - The values of arguments \VAR{dst}, \VAR{sst}, \VAR{nelems}, \VAR{PE\_start}, - \VAR{logPE\_stride}, and \VAR{PE\_size} must be equal on all \acp{PE} in the - active set.
The same \VAR{dest} and \VAR{source} data objects, and the same - \VAR{pSync} work array must be passed to all \acp{PE} in the active set. - - Before any \ac{PE} calls a \FUNC{shmem\_alltoalls} routine, - the following conditions must be ensured: + See the description of \FUNC{shmem\_alltoall} in + Section~\ref{subsec:shmem_alltoall} for: \begin{itemize} - \item The \VAR{pSync} array on all \acp{PE} in the active set is not - still in use from a prior call to a \FUNC{shmem\_alltoall} routine. - \item The \VAR{dest} data object on all \acp{PE} in the active set is - ready to accept the \FUNC{shmem\_alltoalls} data. + \item Data element sizes for the different sized and typed \FUNC{shmem\_alltoalls} variants. + \item Rules for \ac{PE} participation in the collective routine. + \item The pre- and post-conditions for symmetric objects. + \item Typing constraints for \dest{} and \source{} data objects. \end{itemize} - Otherwise, the behavior is undefined. - - Upon return from a \FUNC{shmem\_alltoalls} routine, the following is true for - the local PE: Its \VAR{dest} symmetric data object is completely updated and - the data has been copied out of the \VAR{source} data object. - The values in the \VAR{pSync} array are restored to the original values. -} - -\apidesctable{ -The \dest{} and \source{} data objects must conform to certain typing -constraints, which are as follows: -}{Routine}{Data type of \VAR{dest} and \VAR{source}} + +} -\apitablerow{shmem\_alltoalls64}{\CONST{64} bits aligned.} -\apitablerow{shmem\_alltoalls32}{\CONST{32} bits aligned.} \apireturnvalues{ - None. + Zero on successful local completion. Nonzero otherwise. } \apinotes{ - This routine restores \VAR{pSync} to its original contents. Multiple calls - to \openshmem\ routines that use the same \VAR{pSync} array do not require - that \VAR{pSync} be reinitialized after the first call. 
- The user must ensure that the \VAR{pSync} array is not being updated by any - \ac{PE} in the active set while any of the \acp{PE} participates in - processing of an \openshmem\ \FUNC{shmem\_alltoalls} routine. Be careful to - avoid these situations: If the \VAR{pSync} array is initialized at run time, - some type of synchronization is needed to ensure that all \acp{PE} in the - active set have initialized \VAR{pSync} before any of them enter an - \openshmem\ routine called with the \VAR{pSync} synchronization array. A - \VAR{pSync} array may be reused on a subsequent \openshmem\ - \FUNC{shmem\_alltoalls} routine only if none of the \acp{PE} in the - active set are still processing a prior \openshmem\ \FUNC{shmem\_alltoalls} - routine call that used the same \VAR{pSync} array. In general, this can be - ensured only by doing some type of synchronization. + See notes for \FUNC{shmem\_alltoall} in Section~\ref{subsec:shmem_alltoall}. } \begin{apiexamples} \apicexample - {This example shows a \FUNC{shmem\_alltoalls64} on two long elements among + {This \CorCpp{} example shows a \FUNC{shmem\_int64\_alltoalls} on two 64-bit integers among all \acp{PE}.} {./example_code/shmem_alltoalls_example.c} {} diff --git a/content/shmem_atomic_compare_swap_nbi.tex b/content/shmem_atomic_compare_swap_nbi.tex new file mode 100644 index 000000000..467b9e015 --- /dev/null +++ b/content/shmem_atomic_compare_swap_nbi.tex @@ -0,0 +1,59 @@ +\apisummary{ + The nonblocking atomic routine provides a method for performing an atomic + conditional swap on a remote data object. +} + +\begin{apidefinition} + +\begin{C11synopsis} +void @\FuncDecl{shmem\_atomic\_compare\_swap\_nbi}@(TYPE *fetch, TYPE *dest, TYPE cond, TYPE value, int pe); +void @\FuncDecl{shmem\_atomic\_compare\_swap\_nbi}@(shmem_ctx_t ctx, TYPE *fetch, TYPE *dest, TYPE cond, TYPE value, int pe); +\end{C11synopsis} +where \TYPE{} is one of the standard \ac{AMO} types specified by +Table~\ref{stdamotypes}. 
+ +\begin{Csynopsis} +void @\FuncDecl{shmem\_\FuncParam{TYPENAME}\_atomic\_compare\_swap\_nbi}@(TYPE *fetch, TYPE *dest, TYPE cond, TYPE value, int pe); +void @\FuncDecl{shmem\_ctx\_\FuncParam{TYPENAME}\_atomic\_compare\_swap\_nbi}@(shmem_ctx_t ctx, TYPE *fetch, TYPE *dest, TYPE cond, TYPE value, int pe); +\end{Csynopsis} +where \TYPE{} is one of the standard \ac{AMO} types and has a corresponding +\TYPENAME{} specified by Table~\ref{stdamotypes}. + +\begin{apiarguments} + \apiargument{IN}{ctx}{A context handle specifying the context on which to + perform the operation. When this argument is not provided, the operation + is performed on the default context.} + \apiargument{OUT}{fetch}{Local data object to be updated.} + \apiargument{OUT}{dest}{The remotely accessible data object to be updated on + the remote \ac{PE}. } + \apiargument{IN}{cond}{\VAR{cond} is compared to the remote \VAR{dest} + value. If \VAR{cond} and the remote \VAR{dest} are equal, then \VAR{value} + is swapped into the remote \VAR{dest}; otherwise, the remote \VAR{dest} is + unchanged.} + \apiargument{IN}{value}{The value to be atomically written to the remote + \ac{PE}. } + \apiargument{IN}{pe}{An integer that indicates the \ac{PE} number upon which + \VAR{dest} is to be updated.} +\end{apiarguments} + +\apidescription{ + The nonblocking conditional swap routines conditionally update a \VAR{dest} + data object on the specified \ac{PE} as an atomic operation and fetch the prior contents of the + \VAR{dest} data object into the \VAR{fetch} local data object. + This routine returns after initiating the operation. The operation + is considered complete after a subsequent call to \FUNC{shmem\_quiet}. At the + completion of \FUNC{shmem\_quiet}, the prior contents of the \VAR{dest} data + object have been fetched into the \VAR{fetch} local data object and the + contents of \VAR{value} have been conditionally updated into \VAR{dest} on the + remote \ac{PE}. +} + +\apireturnvalues{ + None.
+} + +\apinotes{ + None. +} + +\end{apidefinition} diff --git a/content/shmem_atomic_fetch_add_nbi.tex b/content/shmem_atomic_fetch_add_nbi.tex new file mode 100644 index 000000000..4efce200b --- /dev/null +++ b/content/shmem_atomic_fetch_add_nbi.tex @@ -0,0 +1,56 @@ +\apisummary{ + The nonblocking atomic routine performs an atomic fetch-and-add operation on + a remote data object. +} + +\begin{apidefinition} + +\begin{C11synopsis} +void @\FuncDecl{shmem\_atomic\_fetch\_add\_nbi}@(TYPE *fetch, TYPE *dest, TYPE value, int pe); +void @\FuncDecl{shmem\_atomic\_fetch\_add\_nbi}@(shmem_ctx_t ctx, TYPE *fetch, TYPE *dest, TYPE value, int pe); +\end{C11synopsis} +where \TYPE{} is one of the standard \ac{AMO} types specified by +Table~\ref{stdamotypes}. + +\begin{Csynopsis} +void @\FuncDecl{shmem\_\FuncParam{TYPENAME}\_atomic\_fetch\_add\_nbi}@(TYPE *fetch, TYPE *dest, TYPE value, int pe); +void @\FuncDecl{shmem\_ctx\_\FuncParam{TYPENAME}\_atomic\_fetch\_add\_nbi}@(shmem_ctx_t ctx, TYPE *fetch, TYPE *dest, TYPE value, int pe); +\end{Csynopsis} +where \TYPE{} is one of the standard \ac{AMO} types and has a corresponding +\TYPENAME{} specified by Table~\ref{stdamotypes}. + +\begin{apiarguments} + + \apiargument{IN}{ctx}{A context handle specifying the context on which to + perform the operation. When this argument is not provided, the operation + is performed on the default context.} + \apiargument{OUT}{fetch}{Local data object to be updated.} + \apiargument{OUT}{dest}{The remotely accessible data object to be updated on + the remote \ac{PE}.} + \apiargument{IN}{value}{The value to be atomically added to \VAR{dest}.} + \apiargument{IN}{pe}{An integer that indicates the \ac{PE} number on which + \VAR{dest} is to be updated.} + +\end{apiarguments} + +\apidescription{ + The nonblocking \FUNC{shmem\_atomic\_fetch\_add\_nbi} routines perform an + atomic fetch-and-add operation. 
An atomic fetch-and-add operation fetches + the old \VAR{dest} and adds \VAR{value} to \VAR{dest} without the + possibility of another atomic operation on the \VAR{dest} between the time + of the fetch and the update. This routine returns after initiating the + operation. The operation is considered complete after a subsequent call to + \FUNC{shmem\_quiet}. At the completion of \FUNC{shmem\_quiet}, \VAR{value} + has been added to \VAR{dest} on \VAR{pe} and the prior contents of \VAR{dest} + fetched into the \VAR{fetch} local data object. +} + +\apireturnvalues{ + None. +} + +\apinotes{ + None. +} + +\end{apidefinition} diff --git a/content/shmem_atomic_fetch_and_nbi.tex b/content/shmem_atomic_fetch_and_nbi.tex new file mode 100644 index 000000000..925d75be5 --- /dev/null +++ b/content/shmem_atomic_fetch_and_nbi.tex @@ -0,0 +1,55 @@ +\apisummary{ + This nonblocking atomic operation performs an atomic fetching bitwise AND + operation on a remote data object. +} + +\begin{apidefinition} + +\begin{C11synopsis} +void @\FuncDecl{shmem\_atomic\_fetch\_and\_nbi}@(TYPE *fetch, TYPE *dest, TYPE value, int pe); +void @\FuncDecl{shmem\_atomic\_fetch\_and\_nbi}@(shmem_ctx_t ctx, TYPE *fetch, TYPE *dest, TYPE value, int pe); +\end{C11synopsis} +where \TYPE{} is one of the bitwise \ac{AMO} types specified by +Table~\ref{bitamotypes}. + +\begin{Csynopsis} +void @\FuncDecl{shmem\_\FuncParam{TYPENAME}\_atomic\_fetch\_and\_nbi}@(TYPE *fetch, TYPE *dest, TYPE value, int pe); +void @\FuncDecl{shmem\_ctx\_\FuncParam{TYPENAME}\_atomic\_fetch\_and\_nbi}@(shmem_ctx_t ctx, TYPE *fetch, TYPE *dest, TYPE value, int pe); +\end{Csynopsis} +where \TYPE{} is one of the bitwise \ac{AMO} types and has a corresponding +\TYPENAME{} specified by Table~\ref{bitamotypes}. + +\begin{apiarguments} + + \apiargument{IN}{ctx}{A context handle specifying the context on which to + perform the operation. 
When this argument is not provided, the + operation is performed on the default context.} + \apiargument{OUT}{fetch}{Local data object to be updated.} + \apiargument{OUT}{dest}{A pointer to the remotely accessible data object to + be updated.} + \apiargument{IN}{value}{The operand to the bitwise AND operation.} + \apiargument{IN}{pe}{An integer value for the \ac{PE} on which \VAR{dest} + is to be updated.} + +\end{apiarguments} + +\apidescription{ + The nonblocking \FUNC{shmem\_atomic\_fetch\_and\_nbi} routines perform an + atomic fetching bitwise AND on the remotely accessible data object pointed + to by \VAR{dest} at \ac{PE} \VAR{pe} with the operand \VAR{value}. This routine + returns after initiating the operation. The operation is considered complete + after a subsequent call to \FUNC{shmem\_quiet}. At the completion of + \FUNC{shmem\_quiet}, these routines have performed a fetching bitwise AND on + \VAR{dest} at \ac{PE} \VAR{pe} with the operand \VAR{value} and fetched the prior + contents of \VAR{dest} into the \VAR{fetch} local data object. +} + +\apireturnvalues{ + None. +} + +\apinotes{ + None. +} + +\end{apidefinition} diff --git a/content/shmem_atomic_fetch_inc_nbi.tex b/content/shmem_atomic_fetch_inc_nbi.tex new file mode 100644 index 000000000..c960c5efd --- /dev/null +++ b/content/shmem_atomic_fetch_inc_nbi.tex @@ -0,0 +1,53 @@ +\apisummary{ + This nonblocking atomic routine performs an atomic fetch-and-increment + operation on a remote data object. +} + +\begin{apidefinition} + +\begin{C11synopsis} +void @\FuncDecl{shmem\_atomic\_fetch\_inc\_nbi}@(TYPE *fetch, TYPE *dest, int pe); +void @\FuncDecl{shmem\_atomic\_fetch\_inc\_nbi}@(shmem_ctx_t ctx, TYPE *fetch, TYPE *dest, int pe); +\end{C11synopsis} +where \TYPE{} is one of the standard \ac{AMO} types specified by +Table~\ref{stdamotypes}.
+ +\begin{Csynopsis} +void @\FuncDecl{shmem\_\FuncParam{TYPENAME}\_atomic\_fetch\_inc\_nbi}@(TYPE *fetch, TYPE *dest, int pe); +void @\FuncDecl{shmem\_ctx\_\FuncParam{TYPENAME}\_atomic\_fetch\_inc\_nbi}@(shmem_ctx_t ctx, TYPE *fetch, TYPE *dest, int pe); +\end{Csynopsis} +where \TYPE{} is one of the standard \ac{AMO} types and has a corresponding +\TYPENAME{} specified by Table~\ref{stdamotypes}. + +\begin{apiarguments} + +\apiargument{IN}{ctx}{A context handle specifying the context on which to + perform the operation. When this argument is not provided, the + operation is performed on the default context.} +\apiargument{OUT}{fetch}{Local data object to be updated.} +\apiargument{OUT}{dest}{The remotely accessible data object to be updated on the + remote \ac{PE}.} +\apiargument{IN}{pe}{An integer that indicates the \ac{PE} number on which + \dest{} is to be updated.} + +\end{apiarguments} + + +\apidescription{ + The nonblocking \FUNC{shmem\_atomic\_fetch\_inc\_nbi} routines perform an + atomic fetch-and-increment operation. This routine returns after initiating the + operation. The operation is considered complete after a subsequent call to + \FUNC{shmem\_quiet}. At the completion of \FUNC{shmem\_quiet}, \dest{} on + \ac{PE} \VAR{pe} has been increased by one and the previous contents of \dest{} + fetched into the \VAR{fetch} local data object. +} + +\apireturnvalues{ + None. +} + +\apinotes{ + None. +} + +\end{apidefinition} diff --git a/content/shmem_atomic_fetch_nbi.tex b/content/shmem_atomic_fetch_nbi.tex new file mode 100644 index 000000000..a816bd01c --- /dev/null +++ b/content/shmem_atomic_fetch_nbi.tex @@ -0,0 +1,52 @@ +\apisummary{ + The nonblocking atomic fetch routine provides a method for atomically + fetching the value of a remote data object. 
+} + +\begin{apidefinition} + +\begin{C11synopsis} +void @\FuncDecl{shmem\_atomic\_fetch\_nbi}@(TYPE *fetch, const TYPE *source, int pe); +void @\FuncDecl{shmem\_atomic\_fetch\_nbi}@(shmem_ctx_t ctx, TYPE *fetch, const TYPE *source, int pe); +\end{C11synopsis} +where \TYPE{} is one of the extended \ac{AMO} types specified by +Table~\ref{extamotypes}. + +\begin{Csynopsis} +void @\FuncDecl{shmem\_\FuncParam{TYPENAME}\_atomic\_fetch\_nbi}@(TYPE *fetch, const TYPE *source, int pe); +void @\FuncDecl{shmem\_ctx\_\FuncParam{TYPENAME}\_atomic\_fetch\_nbi}@(shmem_ctx_t ctx, TYPE *fetch, const TYPE *source, int pe); +\end{Csynopsis} +where \TYPE{} is one of the extended \ac{AMO} types and has a corresponding +\TYPENAME{} specified by Table~\ref{extamotypes}. + +\begin{apiarguments} + + \apiargument{IN}{ctx}{A context handle specifying the context on which to + perform the operation. When this argument is not provided, the operation is + performed on the default context.} + \apiargument{OUT}{fetch}{Local data object to be updated.} + \apiargument{IN}{source}{The remotely accessible data object to be fetched + from the remote \ac{PE}.} + \apiargument{IN}{pe}{An integer that indicates the \ac{PE} number from which + \VAR{source} is to be fetched.} + +\end{apiarguments} + +\apidescription{ + The nonblocking atomic fetch routines perform a nonblocking fetch of a + value atomically from a remote data object. This routine returns after + initiating the operation. The operation is considered complete after a + subsequent call to \FUNC{shmem\_quiet}. At the completion of + \FUNC{shmem\_quiet}, the contents of the \source{} data object on \ac{PE} \VAR{pe} have been + fetched into the \VAR{fetch} local data object. +} + +\apireturnvalues{ + None. +} + +\apinotes{ + None.
+} + +\end{apidefinition} diff --git a/content/shmem_atomic_fetch_or_nbi.tex b/content/shmem_atomic_fetch_or_nbi.tex new file mode 100644 index 000000000..5ce26bb6c --- /dev/null +++ b/content/shmem_atomic_fetch_or_nbi.tex @@ -0,0 +1,55 @@ +\apisummary{ + This nonblocking atomic operation performs an atomic fetching bitwise OR + operation on a remote data object. +} + +\begin{apidefinition} + +\begin{C11synopsis} +void @\FuncDecl{shmem\_atomic\_fetch\_or\_nbi}@(TYPE *fetch, TYPE *dest, TYPE value, int pe); +void @\FuncDecl{shmem\_atomic\_fetch\_or\_nbi}@(shmem_ctx_t ctx, TYPE *fetch, TYPE *dest, TYPE value, int pe); +\end{C11synopsis} +where \TYPE{} is one of the bitwise \ac{AMO} types specified by +Table~\ref{bitamotypes}. + +\begin{Csynopsis} +void @\FuncDecl{shmem\_\FuncParam{TYPENAME}\_atomic\_fetch\_or\_nbi}@(TYPE *fetch, TYPE *dest, TYPE value, int pe); +void @\FuncDecl{shmem\_ctx\_\FuncParam{TYPENAME}\_atomic\_fetch\_or\_nbi}@(shmem_ctx_t ctx, TYPE *fetch, TYPE *dest, TYPE value, int pe); +\end{Csynopsis} +where \TYPE{} is one of the bitwise \ac{AMO} types and has a corresponding +\TYPENAME{} specified by Table~\ref{bitamotypes}. + +\begin{apiarguments} + + \apiargument{IN}{ctx}{A context handle specifying the context on which to + perform the operation. When this argument is not provided, the + operation is performed on the default context.} + \apiargument{OUT}{fetch}{Local data object to be updated.} + \apiargument{OUT}{dest}{A pointer to the remotely accessible data object to + be updated.} + \apiargument{IN}{value}{The operand to the bitwise OR operation.} + \apiargument{IN}{pe}{An integer value for the \ac{PE} on which \VAR{dest} + is to be updated.} + +\end{apiarguments} + +\apidescription{ + The nonblocking \FUNC{shmem\_atomic\_fetch\_or\_nbi} routines perform an + atomic fetching bitwise OR on the remotely accessible data object pointed + to by \VAR{dest} at \ac{PE} \VAR{pe} with the operand \VAR{value}. This routine + returns after initiating the operation.
The operation is considered complete + after a subsequent call to \FUNC{shmem\_quiet}. At the completion of + \FUNC{shmem\_quiet}, these routines have performed a fetching bitwise OR on + \VAR{dest} at \ac{PE} \VAR{pe} with the operand \VAR{value} and fetched the prior + contents of \VAR{dest} into the \VAR{fetch} local data object. +} + +\apireturnvalues{ + None. +} + +\apinotes{ + None. +} + +\end{apidefinition} diff --git a/content/shmem_atomic_fetch_xor_nbi.tex b/content/shmem_atomic_fetch_xor_nbi.tex new file mode 100644 index 000000000..afec95a51 --- /dev/null +++ b/content/shmem_atomic_fetch_xor_nbi.tex @@ -0,0 +1,55 @@ +\apisummary{ + This nonblocking atomic operation performs an atomic fetching bitwise XOR + operation on a remote data object. +} + +\begin{apidefinition} + +\begin{C11synopsis} +void @\FuncDecl{shmem\_atomic\_fetch\_xor\_nbi}@(TYPE *fetch, TYPE *dest, TYPE value, int pe); +void @\FuncDecl{shmem\_atomic\_fetch\_xor\_nbi}@(shmem_ctx_t ctx, TYPE *fetch, TYPE *dest, TYPE value, int pe); +\end{C11synopsis} +where \TYPE{} is one of the bitwise \ac{AMO} types specified by +Table~\ref{bitamotypes}. + +\begin{Csynopsis} +void @\FuncDecl{shmem\_\FuncParam{TYPENAME}\_atomic\_fetch\_xor\_nbi}@(TYPE *fetch, TYPE *dest, TYPE value, int pe); +void @\FuncDecl{shmem\_ctx\_\FuncParam{TYPENAME}\_atomic\_fetch\_xor\_nbi}@(shmem_ctx_t ctx, TYPE *fetch, TYPE *dest, TYPE value, int pe); +\end{Csynopsis} +where \TYPE{} is one of the bitwise \ac{AMO} types and has a corresponding +\TYPENAME{} specified by Table~\ref{bitamotypes}. + +\begin{apiarguments} + + \apiargument{IN}{ctx}{A context handle specifying the context on which to + perform the operation.
When this argument is not provided, the + operation is performed on the default context.} + \apiargument{OUT}{fetch}{Local data object to be updated.} + \apiargument{OUT}{dest}{A pointer to the remotely accessible data object to + be updated.} + \apiargument{IN}{value}{The operand to the bitwise XOR operation.} + \apiargument{IN}{pe}{An integer value for the \ac{PE} on which \VAR{dest} + is to be updated.} + +\end{apiarguments} + +\apidescription{ + The nonblocking \FUNC{shmem\_atomic\_fetch\_xor\_nbi} routines perform an + atomic fetching bitwise XOR on the remotely accessible data object pointed + to by \VAR{dest} at \ac{PE} \VAR{pe} with the operand \VAR{value}. This routine + returns after initiating the operation. The operation is considered complete + after a subsequent call to \FUNC{shmem\_quiet}. At the completion of + \FUNC{shmem\_quiet}, these routines have performed a fetching bitwise XOR on + \VAR{dest} at \ac{PE} \VAR{pe} with the operand \VAR{value} and fetched the prior + contents of \VAR{dest} into the \VAR{fetch} local data object. +} + +\apireturnvalues{ + None. +} + +\apinotes{ + None. +} + +\end{apidefinition} diff --git a/content/shmem_atomic_swap_nbi.tex b/content/shmem_atomic_swap_nbi.tex new file mode 100644 index 000000000..0007f0ef4 --- /dev/null +++ b/content/shmem_atomic_swap_nbi.tex @@ -0,0 +1,52 @@ +\apisummary{ + This nonblocking atomic operation performs an atomic swap to a remote data + object. +} + +\begin{apidefinition} + +\begin{C11synopsis} +void @\FuncDecl{shmem\_atomic\_swap\_nbi}@(TYPE *fetch, TYPE *dest, TYPE value, int pe); +void @\FuncDecl{shmem\_atomic\_swap\_nbi}@(shmem_ctx_t ctx, TYPE *fetch, TYPE *dest, TYPE value, int pe); +\end{C11synopsis} +where \TYPE{} is one of the extended \ac{AMO} types specified by +Table~\ref{extamotypes}.
+ +\begin{Csynopsis} +void @\FuncDecl{shmem\_\FuncParam{TYPENAME}\_atomic\_swap\_nbi}@(TYPE *fetch, TYPE *dest, TYPE value, int pe); +void @\FuncDecl{shmem\_ctx\_\FuncParam{TYPENAME}\_atomic\_swap\_nbi}@(shmem_ctx_t ctx, TYPE *fetch, TYPE *dest, TYPE value, int pe); +\end{Csynopsis} +where \TYPE{} is one of the extended \ac{AMO} types and has a corresponding +\TYPENAME{} specified by Table~\ref{extamotypes}. + +\begin{apiarguments} + \apiargument{IN}{ctx}{A context handle specifying the context on which to + perform the operation. When this argument is not provided, the + operation is performed on the default context.} + \apiargument{OUT}{fetch}{Local data object to be updated.} + \apiargument{OUT}{dest}{The remotely accessible data object to be + updated on the remote \ac{PE}.} + \apiargument{IN}{value}{The value to be atomically written to the remote + \ac{PE}.} + \apiargument{IN}{pe}{An integer that indicates the \ac{PE} number on which + \dest{} is to be updated.} +\end{apiarguments} + +\apidescription{ + The nonblocking \FUNC{shmem\_atomic\_swap\_nbi} routines perform an atomic + swap operation. This routine returns after initiating the operation. The + operation is considered complete after a subsequent call to + \FUNC{shmem\_quiet}. At the completion of \FUNC{shmem\_quiet}, the routine has written + \VAR{value} into \dest{} on \ac{PE} \VAR{pe} and fetched the prior contents of + \dest{} into the \VAR{fetch} local data object. +} + +\apireturnvalues{ + None. +} + +\apinotes{ + None. +} + +\end{apidefinition} diff --git a/content/shmem_barrier.tex b/content/shmem_barrier.tex index a78db8c65..40405feb6 100644 --- a/content/shmem_barrier.tex +++ b/content/shmem_barrier.tex @@ -1,3 +1,4 @@ +\begin{DeprecateBlock} \apisummary{ Performs all operations described in the \FUNC{shmem\_barrier\_all} interface but with respect to a subset of \acp{PE} defined by the active set.
@@ -28,7 +29,7 @@ \apidescription{ \FUNC{shmem\_barrier} is a collective synchronization routine over an - active set. Control returns from \FUNC{shmem\_barrier} after all \acp{PE} in + active set. Control returns from \FUNC{shmem\_barrier} after all \acp{PE} in the active set (specified by \VAR{PE\_start}, \VAR{logPE\_stride}, and \VAR{PE\_size}) have called \FUNC{shmem\_barrier}. @@ -46,6 +47,7 @@ The same \VAR{pSync} array may be reused on consecutive calls to \FUNC{shmem\_barrier} if the same active set is used. + } \apireturnvalues{ @@ -53,6 +55,14 @@ } \apinotes{ + As of \openshmem[1.5], \FUNC{shmem\_barrier} has been deprecated. + No team-based barrier is provided by \openshmem, as a team may have any + number of communication contexts associated with the team. + Applications seeking such an idiom should call + \FUNC{shmem\_ctx\_quiet} on the desired communication context, + followed by a call to \FUNC{shmem\_team\_sync} on the desired + team. + If the \VAR{pSync} array is initialized at the run time, all \acp{PE} must be synchronized before the first call to \FUNC{shmem\_barrier} (e.g., by \FUNC{shmem\_barrier\_all}) to ensure the array has been initialized @@ -81,3 +91,4 @@ \end{apiexamples} \end{apidefinition} +\end{DeprecateBlock} diff --git a/content/shmem_barrier_all.tex b/content/shmem_barrier_all.tex index eef23dd8b..45e698ca6 100644 --- a/content/shmem_barrier_all.tex +++ b/content/shmem_barrier_all.tex @@ -16,12 +16,13 @@ \end{apiarguments} -\apidescription{ - The \FUNC{shmem\_barrier\_all} routine registers the arrival of a \ac{PE} at - a barrier. Barriers are a mechanism for synchronizing all \acp{PE} at - once. This routine blocks the \ac{PE} until all \acp{PE} have called +\apidescription{ + The \FUNC{shmem\_barrier\_all} routine + is a mechanism for synchronizing all \acp{PE} in the default team at + once. This routine blocks the calling \ac{PE} until all \acp{PE} have called \FUNC{shmem\_barrier\_all}. 
In a multithreaded \openshmem - program, only the calling thread is blocked. + program, only the calling thread is blocked; however, + the routine may not be called concurrently by multiple threads in the same \ac{PE}. Prior to synchronizing with other \acp{PE}, \FUNC{shmem\_barrier\_all} ensures completion of all previously issued memory stores and remote memory @@ -36,6 +37,10 @@ } \apinotes{ + The \FUNC{shmem\_barrier\_all} routine is equivalent to calling + \FUNC{shmem\_ctx\_quiet} on the default context followed by + calling \FUNC{shmem\_team\_sync} on the default team. + The \FUNC{shmem\_barrier\_all} routine can be used to portably ensure that memory access operations observe remote updates in the order enforced by initiator \acp{PE}. diff --git a/content/shmem_broadcast.tex b/content/shmem_broadcast.tex index fc5a7b592..6686d2507 100644 --- a/content/shmem_broadcast.tex +++ b/content/shmem_broadcast.tex @@ -5,22 +5,49 @@ \begin{apidefinition} +%% C11 +\begin{C11synopsis} +int @\FuncDecl{shmem\_broadcast}@(shmem_team_t team, TYPE *dest, const TYPE *source, size_t nelems, int PE_root); +\end{C11synopsis} +where \TYPE{} is one of the standard \ac{RMA} types specified by Table~\ref{stdrmatypes}. + +%% C/C++ \begin{Csynopsis} +\end{Csynopsis} +\begin{CsynopsisCol} +int @\FuncDecl{shmem\_\FuncParam{TYPENAME}\_broadcast}@(shmem_team_t team, TYPE *dest, const TYPE *source, size_t nelems, int PE_root); +\end{CsynopsisCol} +where \TYPE{} is one of the standard \ac{RMA} types and has a corresponding \TYPENAME{} specified by Table~\ref{stdrmatypes}.
+ +\begin{CsynopsisCol} +int @\FuncDecl{shmem\_broadcastmem}@(shmem_team_t team, void *dest, const void *source, size_t nelems, int PE_root); +\end{CsynopsisCol} + +\begin{DeprecateBlock} +\begin{CsynopsisCol} void @\FuncDecl{shmem\_broadcast32}@(void *dest, const void *source, size_t nelems, int PE_root, int PE_start, int logPE_stride, int PE_size, long *pSync); void @\FuncDecl{shmem\_broadcast64}@(void *dest, const void *source, size_t nelems, int PE_root, int PE_start, int logPE_stride, int PE_size, long *pSync); -\end{Csynopsis} +\end{CsynopsisCol} +\end{DeprecateBlock} \begin{apiarguments} -\apiargument{OUT}{dest}{A symmetric data object.} +\apiargument{IN}{team}{The team over which to perform the operation.}% + +\apiargument{OUT}{dest}{A symmetric data object. See the table below in this description + for allowable types.} \apiargument{IN}{source}{A symmetric data object that can be of any data type that is permissible for the \dest{} argument.} -\apiargument{IN}{nelems}{The number of elements in \source. For - \FUNC{shmem\_broadcast32}, this is the number of - 32-bit halfwords. nelems must be of type \VAR{size\_t} in \Cstd.} +\apiargument{IN}{nelems}{The number of elements in \source. + nelems must be of type \VAR{size\_t} in \Cstd. When + using \Fortran, it must be a default integer value.} \apiargument{IN}{PE\_root}{Zero-based ordinal of the \ac{PE}, with respect to - the active set, from which the data is copied. Must be greater than or equal to - 0 and less than \VAR{PE\_size}. \VAR{PE\_root} must be of type integer.} + the team or active set, from which the data is copied. + \VAR{PE\_root} must be of type \CTYPE{int}. + When using \Fortran, it must be a default integer value.} + +\begin{DeprecateBlock} + \apiargument{IN}{PE\_start}{The lowest \ac{PE} number of the active set of \acp{PE}. 
\VAR{PE\_start} must be of type integer.} \apiargument{IN}{logPE\_stride}{ The log (base 2) of the stride between @@ -34,32 +61,53 @@ Every element of this array must be initialized with the value \CONST{SHMEM\_SYNC\_VALUE} before any of the \acp{PE} in the active set enters \FUNC{shmem\_broadcast}.} +\end{DeprecateBlock} \end{apiarguments} -\apidescription{ - \openshmem broadcast routines are collective routines. They copy data object +\apidescription{ + \openshmem broadcast routines are collective routines over an active set or + existing \openshmem team. They copy data object \source{} on the processor specified by \VAR{PE\_root} and store the values at - \dest{} on the other \acp{PE} specified by the triplet \VAR{PE\_start}, - \VAR{logPE\_stride}, \VAR{PE\_size}. The data is not copied to the \dest{} area - on the root \ac{PE}. + \dest{} on the other \acp{PE} participating in the collective operation. + The data is not copied to the \dest{} area on the root \ac{PE}. - As with all \openshmem collective routines, each of these routines assumes that - only \acp{PE} in the active set call the routine. If a \ac{PE} not in the - active set calls an \openshmem collective routine, the behavior is undefined. + The same \dest{} and \source{} data objects and the same value of \VAR{PE\_root} must be + passed by all \acp{PE} participating in the collective operation. + + Team-based broadcast routines operate over all \acp{PE} in the provided team argument. All + \acp{PE} in the provided team must participate in the operation. + If an invalid team handle or \LibConstRef{SHMEM\_TEAM\_INVALID} is passed to this routine, + the behavior is undefined. + + As with all team-based \openshmem routines, \ac{PE} + numbering is relative to the team. The specified root \ac{PE} must be a valid \ac{PE} + number for the team, between \CONST{0} and \VAR{N-1}, where \VAR{N} is + the size of the team. 
+ Active-set-based broadcast routines operate over all \acp{PE} in the active set + defined by the \VAR{PE\_start}, \VAR{logPE\_stride}, \VAR{PE\_size} triplet. + + As with all active-set-based collective routines, + each of these routines assumes that + only \acp{PE} in the active set call the routine. If a \ac{PE} not in the + active set calls an active-set-based + collective routine, the behavior is undefined. + The values of arguments \VAR{PE\_root}, \VAR{PE\_start}, \VAR{logPE\_stride}, - and \VAR{PE\_size} must be the same value on all \acp{PE} in the active set. The same - \dest{} and \source{} data objects and the same \VAR{pSync} work array must be - passed by all \acp{PE} in the active set. + and \VAR{PE\_size} must be the same value on all \acp{PE} in the active set. + The value of \VAR{PE\_root} must be greater than or equal to \CONST{0} and less than \VAR{PE\_size}. + The same \VAR{pSync} work array must be passed by all \acp{PE} in the active set. - Before any \ac{PE} calls a broadcast routine, - the following conditions must be ensured: + Before any \ac{PE} calls a broadcast routine, the following conditions must be ensured: \begin{itemize} - \item The \VAR{pSync} array on all \acp{PE} in the active set is - not still in use from a prior call to a broadcast routine. - \item The \dest{} array on all \acp{PE} in the active set is ready - to accept the broadcast data. + \item The \dest{} array on all \acp{PE} participating in the broadcast + %% + is ready to accept the broadcast data. + \item If using active-set-based routines, the + \VAR{pSync} array on all \acp{PE} in the + active set is not still in use from a prior call to a collective + \openshmem routine. \end{itemize} Otherwise, the behavior is undefined. @@ -69,8 +117,8 @@ \item If the current \ac{PE} is not the root \ac{PE}, the \dest{} data object is updated. \item The \source{} data object may be safely reused. - \item The values in the \VAR{pSync} array are restored to the - original values.
+ \item If using active-set-based routines, + the values in the \VAR{pSync} array are restored to the original values. \end{itemize} } @@ -79,11 +127,12 @@ constraints, which are as follows: }{Routine}{Data type of \VAR{dest} and \VAR{source}} +\apitablerow{shmem\_broadcastmem}{\Cstd: Any data type. nelems is scaled in bytes.} \apitablerow{shmem\_broadcast64}{No \CorCpp{} structures are allowed.} \apitablerow{shmem\_broadcast32}{No \CorCpp{} structures are allowed.} \apireturnvalues{ - None. + Zero on successful local completion. Nonzero otherwise. } \apinotes{ @@ -101,13 +150,18 @@ subsequent \openshmem broadcast routine only if none of the \acp{PE} in the active set are still processing a prior \openshmem broadcast routine call that used the same \VAR{pSync} array. In general, this can be ensured only by doing - some type of synchronization. + some type of synchronization. + + Team handle error checking and integer return codes are currently undefined. + Implementations may define these behaviors as needed, but programs should + ensure portability by doing their own checks for invalid team handles and for + \LibConstRef{SHMEM\_TEAM\_INVALID}. } \begin{apiexamples} \apicexample - {In the following example, the call to \FUNC{shmem\_broadcast64} copies \source{} + {In the following \Cstd[11] example, the call to \FUNC{shmem\_broadcast} copies \source{} on \ac{PE} $0$ to \dest{} on \acp{PE} $1\dots npes-1$. 
\CorCpp{} example:} diff --git a/content/shmem_calloc.tex b/content/shmem_calloc.tex index a29eb2d9f..8eeb90be2 100644 --- a/content/shmem_calloc.tex +++ b/content/shmem_calloc.tex @@ -15,7 +15,8 @@ \apidescription{ - The \FUNC{shmem\_calloc} routine is a collective operation that allocates a + The \FUNC{shmem\_calloc} routine is a collective operation + on the default team that allocates a region of remotely-accessible memory for an array of \VAR{count} objects of \VAR{size} bytes each and returns a pointer to the lowest byte address of the allocated symmetric diff --git a/content/shmem_collect.tex b/content/shmem_collect.tex index 691fb023e..5f645c368 100644 --- a/content/shmem_collect.tex +++ b/content/shmem_collect.tex @@ -1,30 +1,52 @@ \apisummary{ Concatenates blocks of data from multiple \acp{PE} to an array in every - \ac{PE}. + \ac{PE} participating in the collective routine. } \begin{apidefinition} +%% C11 +\begin{C11synopsis} +int @\FuncDecl{shmem\_collect}@(shmem_team_t team, TYPE *dest, const TYPE *source, size_t nelems); +int @\FuncDecl{shmem\_fcollect}@(shmem_team_t team, TYPE *dest, const TYPE *source, size_t nelems); +\end{C11synopsis} +where \TYPE{} is one of the standard \ac{RMA} types specified by Table \ref{stdrmatypes}. + \begin{Csynopsis} +\end{Csynopsis} +\begin{CsynopsisCol} +int @\FuncDecl{shmem\_\FuncParam{TYPENAME}\_collect}@(shmem_team_t team, TYPE *dest, const TYPE *source, size_t nelems); +int @\FuncDecl{shmem\_\FuncParam{TYPENAME}\_fcollect}@(shmem_team_t team, TYPE *dest, const TYPE *source, size_t nelems); +\end{CsynopsisCol} +where \TYPE{} is one of the standard \ac{RMA} types and has a corresponding \TYPENAME{} specified by Table \ref{stdrmatypes}. 
+ +\begin{CsynopsisCol} +int @\FuncDecl{shmem\_collectmem}@(shmem_team_t team, void *dest, const void *source, size_t nelems); +int @\FuncDecl{shmem\_fcollectmem}@(shmem_team_t team, void *dest, const void *source, size_t nelems); +\end{CsynopsisCol} + +\begin{DeprecateBlock} +\begin{CsynopsisCol} void @\FuncDecl{shmem\_collect32}@(void *dest, const void *source, size_t nelems, int PE_start, int logPE_stride, int PE_size, long *pSync); void @\FuncDecl{shmem\_collect64}@(void *dest, const void *source, size_t nelems, int PE_start, int logPE_stride, int PE_size, long *pSync); void @\FuncDecl{shmem\_fcollect32}@(void *dest, const void *source, size_t nelems, int PE_start, int logPE_stride, int PE_size, long *pSync); void @\FuncDecl{shmem\_fcollect64}@(void *dest, const void *source, size_t nelems, int PE_start, int logPE_stride, int PE_size, long *pSync); -\end{Csynopsis} +\end{CsynopsisCol} +\end{DeprecateBlock} \begin{apiarguments} -\apiargument{OUT}{dest}{A symmetric array. The \dest{} argument must be large enough - to accept the concatenation of the \source{} arrays on all participating \acp{PE}. The data - types are as follows: For \FUNC{shmem\_collect64}, - and \FUNC{shmem\_fcollect64}, any data type with an - element size of 64 bits. \CorCpp{} structures are not permitted. For - \FUNC{shmem\_collect32}, and \FUNC{shmem\_fcollect32}, - any data type with an element size of \CONST{32} bits. \CorCpp{} structures are not permitted.} +\apiargument{IN}{team}{A valid \openshmem team handle.} + +\apiargument{OUT}{dest}{A symmetric array large enough + to accept the concatenation of the \source{} arrays on all participating \acp{PE}. + See table below in this description for allowable data types.} \apiargument{IN}{source}{A symmetric data object that can be of any type permissible for the \dest{} argument.} \apiargument{IN}{nelems}{The number of elements in the \source{} array. 
\VAR{nelems} must be of type \VAR{size\_t} for \Cstd.} + +\begin{DeprecateBlock} \apiargument{IN}{PE\_start}{The lowest \ac{PE} number of the active set of \acp{PE}. \VAR{PE\_start} must be of type integer.} \apiargument{IN}{logPE\_stride}{The log (base \CONST{2}) of the stride between @@ -38,40 +60,70 @@ Every element of this array must be initialized with the value \CONST{SHMEM\_SYNC\_VALUE} before any of the \acp{PE} in the active set enter \FUNC{shmem\_collect} or \FUNC{shmem\_fcollect}.} +\end{DeprecateBlock} \end{apiarguments} \apidescription{ - \openshmem \FUNC{collect} and \FUNC{fcollect} routines concatenate \VAR{nelems} - \CONST{64}-bit or \CONST{32}-bit data items from the \source{} array into the - \dest{} array, over the set of \acp{PE} defined by \VAR{PE\_start}, - \VAR{log2PE\_stride}, and \VAR{PE\_size}, in processor number order. The - resultant \dest{} array contains the contribution from \ac{PE} \VAR{PE\_start} - first, then the contribution from \ac{PE} \VAR{PE\_start} + \VAR{PE\_stride} - second, and so on. The collected result is written to the \dest{} array for all - \acp{PE} in the active set. - + \openshmem \FUNC{collect} and \FUNC{fcollect} routines perform a collective + operation to concatenate \VAR{nelems} + data items from the \source{} array into the + \dest{} array, over an \openshmem team or active set + in processor number order. The resultant \dest{} array contains the contribution from + \acp{PE} as follows: + + \begin{itemize} + \item For an active set, the data from \ac{PE} \VAR{PE\_start} is first, then the + contribution from \ac{PE} \VAR{PE\_start} + \VAR{PE\_stride} second, and so on. + \item For a team, the data from \ac{PE} number \CONST{0} in the team is first, then the + contribution from \ac{PE} \CONST{1} in the team, and so on. + \end{itemize} + + The collected result is written to the \dest{} array for all \acp{PE} + that participate in the operation. 
The same \dest{} and \source{} + arrays must be passed by all \acp{PE} that participate in the operation. + The \FUNC{fcollect} routines require that \VAR{nelems} be the same value in all participating \acp{PE}, while the \FUNC{collect} routines allow \VAR{nelems} to vary from \ac{PE} to \ac{PE}. - As with all \openshmem collective routines, each of these routines assumes that + Team-based collect routines operate over all \acp{PE} in the provided team argument. All + \acp{PE} in the provided team must participate in the operation. + + Active-set-based collective routines operate over all \acp{PE} in the active set + defined by the \VAR{PE\_start}, \VAR{logPE\_stride}, \VAR{PE\_size} triplet. + As with all active-set-based collective routines, + each of these routines assumes that only \acp{PE} in the active set call the routine. If a \ac{PE} not in the active set and calls this collective routine, the behavior is undefined. - + The values of arguments \VAR{PE\_start}, \VAR{logPE\_stride}, and \VAR{PE\_size} - must be the same value on all \acp{PE} in the active set. The same \dest{} and \source{} - arrays and the same \VAR{pSync} work array must be passed by all \acp{PE} in the - active set. - + must be the same value on all \acp{PE} in the active set. The same + \VAR{pSync} work array must be passed by all \acp{PE} in the active set. + Upon return from a collective routine, the following are true for the local - \ac{PE}: The \dest{} array is updated and the \source{} array may be safely reused. - The values in the \VAR{pSync} array are + \ac{PE}: + \begin{itemize} + \item The \dest{} array is updated and the \source{} array may be safely reused. + \item For active-set-based collective routines, the values in the \VAR{pSync} array are restored to the original values. 
+ \end{itemize} } +\apidesctable{ +The \dest{} and \source{} data objects must conform to certain typing +constraints, which are as follows: +}{Routine}{Data type of \VAR{dest} and \VAR{source}} +\apitablerow{\FUNC{shmem\_collectmem}, \FUNC{shmem\_fcollectmem}}{\Cstd: Any data type. \VAR{nelems} is scaled in bytes.}% +\apitablerow{\FUNC{shmem\_collect64}, \FUNC{shmem\_fcollect64}}% + {Any noncharacter type that has an element size of \CONST{64} bits. No \Fortran derived types nor + \CorCpp{} structures are allowed.} +\apitablerow{\FUNC{shmem\_collect32}, \FUNC{shmem\_fcollect32}}% + {Any noncharacter type that has an element size of \CONST{32} bits. No \Fortran derived types nor + \CorCpp{} structures are allowed.} + \apireturnvalues{ - None. + Zero on successful local completion. Nonzero otherwise. } \apinotes{ diff --git a/content/shmem_ctx_create.tex b/content/shmem_ctx_create.tex index 4ef3c485d..a6298a921 100644 --- a/content/shmem_ctx_create.tex +++ b/content/shmem_ctx_create.tex @@ -1,5 +1,5 @@ \apisummary{ - Create a communication context. + Create a communication context locally. } \begin{apidefinition} @@ -28,6 +28,13 @@ in a correct state. The creation call can be reattempted with different options or after additional resources become available. + A newly created communication context has a fixed association with the + default team. + All \openshmem routines that operate on this context will do so with + respect to the associated \ac{PE} team. + That is, all point-to-point routines operating on this context will use + team-relative \ac{PE} numbering. + By default, contexts are {\em shareable} and, when it is allowed by the threading model provided by the \openshmem library, they can be used concurrently by multiple threads within the PE where they were created. 
diff --git a/content/shmem_ctx_destroy.tex b/content/shmem_ctx_destroy.tex index 01e93c854..2e801685a 100644 --- a/content/shmem_ctx_destroy.tex +++ b/content/shmem_ctx_destroy.tex @@ -14,7 +14,8 @@ \apidescription{ \FUNC{shmem\_ctx\_destroy} destroys a context that was created by a call to - \FUNC{shmem\_ctx\_create}. It is the user's responsibility to ensure that + \FUNC{shmem\_ctx\_create} or \FUNC{shmem\_team\_create\_ctx}. + It is the user's responsibility to ensure that the context is not used after it has been destroyed, for example when the destroyed context is used by multiple threads. This function performs an implicit quiet operation on the given context before it is freed. @@ -27,8 +28,6 @@ } \apinotes{ - It is invalid to pass \CONST{SHMEM\_CTX\_DEFAULT} to this routine. - Destroying a context makes it impossible for the user to complete communication operations that are pending on that context. This includes nonblocking communication operations, whose local buffers are only returned diff --git a/content/shmem_ctx_get_team.tex b/content/shmem_ctx_get_team.tex new file mode 100644 index 000000000..5fe1ea998 --- /dev/null +++ b/content/shmem_ctx_get_team.tex @@ -0,0 +1,55 @@ +\apisummary{ + Retrieve the team associated with the communication context. +} + +\begin{apidefinition} + +\begin{Csynopsis} +int @\FuncDecl{shmem\_ctx\_get\_team}@(shmem_ctx_t ctx, shmem_team_t *team); +\end{Csynopsis} + +\begin{apiarguments} + \apiargument{IN}{ctx}{ + A handle to a communication context. + } + \apiargument{OUT}{team}{ + A pointer to a handle to the associated \ac{PE} team. + } +\end{apiarguments} + +\apidescription{ + The \FUNC{shmem\_ctx\_get\_team} routine returns a handle to the + team associated with the specified communication context \VAR{ctx}. + The team handle is returned through the pointer argument \VAR{team}. + + If \VAR{ctx} is the default context or one created by a call to + \FUNC{shmem\_ctx\_create}, the returned team is the default team. 
+ + When \VAR{ctx} is an invalid context, if \VAR{ctx} compares equal to + \LibConstRef{SHMEM\_CTX\_INVALID}, then \VAR{team} is assigned the + value \LibConstRef{SHMEM\_TEAM\_INVALID} and a nonzero value is + returned; otherwise, the behavior is undefined. + + If \VAR{team} is a null pointer, the behavior is undefined. +} + +\apireturnvalues{ + Zero on success; otherwise, nonzero. +} + +\apinotes{ + None. +} + +\begin{apiexamples} + + \apicexample + {The following example demonstrates the use of contexts for multiple teams in a + \Cstd[11] program. This example shows contexts being used to communicate within + a team using team \ac{PE} numbers, and across teams using translated \ac{PE} numbers.} + {./example_code/shmem_team_context.c} + {} + +\end{apiexamples} + +\end{apidefinition} diff --git a/content/shmem_fence.tex b/content/shmem_fence.tex index f02fd8d7d..7762d8c3f 100644 --- a/content/shmem_fence.tex +++ b/content/shmem_fence.tex @@ -1,6 +1,8 @@ \apisummary{ - Assures ordering of delivery of \PUT{}, \ac{AMO}, memory store, and nonblocking \PUT{} routines - to symmetric data objects. + Assures ordering of delivery of memory store, blocking \PUT{}, + \ac{AMO}, and \OPR{put-with-signal}, as well as nonblocking + \PUT{}, \OPR{put-with-signal}, and \ac{AMO} + routines to symmetric data objects. } \begin{apidefinition} @@ -17,14 +19,21 @@ \end{apiarguments} \apidescription{ - This routine assures ordering of delivery of \PUT{}, \ac{AMO}, memory store, and nonblocking \PUT{} - routines to symmetric data objects. All \PUT{}, \ac{AMO}, memory store, and nonblocking \PUT{} + This routine assures ordering of delivery of memory store, blocking \PUT{}, + \ac{AMO}, and \OPR{put-with-signal}, as well as nonblocking \PUT{}, + \OPR{put-with-signal}, and \ac{AMO} + routines to symmetric data objects. 
All memory store, blocking \PUT{}, + \ac{AMO}, and \OPR{put-with-signal}, as well as nonblocking \PUT{}, + \OPR{put-with-signal}, and \ac{AMO} routines to symmetric data objects issued to a particular remote \ac{PE} on the given context prior to the call to \FUNC{shmem\_fence} are guaranteed to be delivered before any - subsequent \PUT{}, \ac{AMO}, memory store, and nonblocking \PUT{} routines to symmetric data + subsequent memory store, blocking \PUT{}, \ac{AMO}, and \OPR{put-with-signal}, + as well as nonblocking \PUT{}, \OPR{put-with-signal}, and \ac{AMO} + routines to symmetric data objects to the same \ac{PE}. \FUNC{shmem\_fence} guarantees order of delivery, - not completion. It does not guarantee order of delivery of nonblocking \GET{} routines. + not completion. It does not guarantee order of delivery of nonblocking + \GET{} or values fetched by nonblocking \ac{AMO} routines. If \VAR{ctx} has the value \CONST{SHMEM\_CTX\_INVALID}, no operation is performed. } @@ -42,17 +51,19 @@ ordering of its memory accesses. There is a subtle difference between \FUNC{shmem\_fence} and \FUNC{shmem\_quiet}, in that, \FUNC{shmem\_quiet} - guarantees completion of \PUT{}, \ac{AMO}, memory store, and nonblocking \PUT{} routines to + guarantees completion of memory store, blocking \PUT{}, \ac{AMO}, and + \OPR{put-with-signal}, as well as nonblocking \PUT{}, \OPR{put-with-signal}, and \ac{AMO} routines to symmetric data objects which makes the updates visible to all other \acp{PE}. - The \FUNC{shmem\_quiet} routine should be called if completion of \PUT{}, - \ac{AMO}, memory store, and nonblocking \PUT{} routines to symmetric data objects is desired + The \FUNC{shmem\_quiet} routine should be called if completion of memory + store, blocking \PUT{}, \ac{AMO}, and \OPR{put-with-signal}, as well as + nonblocking \PUT{}, \OPR{put-with-signal}, and \ac{AMO} routines to symmetric data objects is desired when multiple remote \acp{PE} are involved. 
In an \openshmem program with multithreaded \acp{PE}, it is the user's responsibility to ensure ordering between operations issued by the threads - in a \ac{PE} that target symmetric memory (e.g. \PUT{}, \ac{AMO}, memory stores, + in a \ac{PE} that target symmetric memory (e.g. \PUT{}, \ac{AMO}, \OPR{put-with-signal}, memory stores, and nonblocking routines) and calls by threads in that \ac{PE} to \FUNC{shmem\_fence}. The \FUNC{shmem\_fence} routine can enforce memory store ordering only for the calling thread. Thus, to ensure ordering for memory stores performed by a thread that is diff --git a/content/shmem_finalize.tex b/content/shmem_finalize.tex index ec01f6583..cfa32d137 100644 --- a/content/shmem_finalize.tex +++ b/content/shmem_finalize.tex @@ -22,7 +22,9 @@ implicit global barrier in \FUNC{shmem\_finalize} to ensure that pending communications are completed and that no resources are released until all \acp{PE} have entered \FUNC{shmem\_finalize}. - This routine destroys all shareable contexts. The user is + This routine destroys all teams created by the \openshmem program. + As a result, all shareable contexts are destroyed. + The user is responsible for destroying all contexts with the \CONST{SHMEM\_CTX\_PRIVATE} option enabled prior to calling this routine; otherwise, the behavior is undefined. diff --git a/content/shmem_get_nbi.tex b/content/shmem_get_nbi.tex index aa7a119c1..37d6e5ec2 100644 --- a/content/shmem_get_nbi.tex +++ b/content/shmem_get_nbi.tex @@ -45,7 +45,7 @@ \apidescription{ The get routines provide a method for copying a contiguous symmetric data object from a different \ac{PE} to a contiguous data object on the local - \ac{PE}. The routines return after posting the operation. The operation is considered + \ac{PE}. The routines return after initiating the operation. The operation is considered complete after a subsequent call to \FUNC{shmem\_quiet}. 
At the completion of \FUNC{shmem\_quiet}, the data has been delivered to the \dest{} array on the local \ac{PE}. diff --git a/content/shmem_malloc.tex b/content/shmem_malloc.tex index c7fe958a1..44b4854de 100644 --- a/content/shmem_malloc.tex +++ b/content/shmem_malloc.tex @@ -23,7 +23,7 @@ \apidescription{ The \FUNC{shmem\_malloc}, \FUNC{shmem\_free}, \FUNC{shmem\_realloc}, and \FUNC{shmem\_align} routines are collective operations that require - participation by all \acp{PE}. + participation by all \acp{PE} in the default team. The \FUNC{shmem\_malloc} routine returns a pointer to a block of at least \VAR{size} bytes, which shall be suitably aligned so that it may be diff --git a/content/shmem_ptr.tex b/content/shmem_ptr.tex index cd5e726c3..2eefc9a5c 100644 --- a/content/shmem_ptr.tex +++ b/content/shmem_ptr.tex @@ -39,7 +39,9 @@ \begin{apiexamples} \apicexample - {This is the equivalent program written in \Cstd[11]:} + {In the following \Cstd[11] example, \ac{PE} 0 uses the \FUNC{shmem\_ptr} + routine to query a pointer and directly access the \VAR{dest} array on + \ac{PE} 1:} {./example_code/shmem_ptr_example.c} {} diff --git a/content/shmem_put_nbi.tex b/content/shmem_put_nbi.tex index c706920f9..a74924992 100644 --- a/content/shmem_put_nbi.tex +++ b/content/shmem_put_nbi.tex @@ -42,7 +42,7 @@ \end{apiarguments} \apidescription{ - The routines return after posting the operation. The operation is considered + The routines return after initiating the operation. The operation is considered complete after a subsequent call to \FUNC{shmem\_quiet}. At the completion of \FUNC{shmem\_quiet}, the data has been copied into the \dest{} array on the destination \ac{PE}. 
diff --git a/content/shmem_put_signal.tex b/content/shmem_put_signal.tex new file mode 100644 index 000000000..0cc3cb512 --- /dev/null +++ b/content/shmem_put_signal.tex @@ -0,0 +1,103 @@ +\apisummary{ + The \OPR{put-with-signal} routines provide a method for copying data from a + contiguous local data object to a data object on a specified \ac{PE} + and subsequently updating a remote flag to signal completion. +} + +\begin{apidefinition} + +\begin{C11synopsis} +void @\FuncDecl{shmem\_put\_signal}@(TYPE *dest, const TYPE *source, size_t nelems, uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +void @\FuncDecl{shmem\_put\_signal}@(shmem_ctx_t ctx, TYPE *dest, const TYPE *source, size_t nelems, uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +\end{C11synopsis} +where \TYPE{} is one of the standard \ac{RMA} types specified by Table \ref{stdrmatypes}. + +\begin{Csynopsis} +void @\FuncDecl{shmem\_\FuncParam{TYPENAME}\_put\_signal}@(TYPE *dest, const TYPE *source, size_t nelems, uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +void @\FuncDecl{shmem\_ctx\_\FuncParam{TYPENAME}\_put\_signal}@(shmem_ctx_t ctx, TYPE *dest, const TYPE *source, size_t nelems, uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +\end{Csynopsis} +where \TYPE{} is one of the standard \ac{RMA} types and has a corresponding \TYPENAME{} specified by Table \ref{stdrmatypes}. + +\begin{CsynopsisCol} +void @\FuncDecl{shmem\_put\FuncParam{SIZE}\_signal}@(void *dest, const void *source, size_t nelems, uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +void @\FuncDecl{shmem\_ctx\_put\FuncParam{SIZE}\_signal}@(shmem_ctx_t ctx, void *dest, const void *source, size_t nelems, uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +\end{CsynopsisCol} +where \SIZE{} is one of \CONST{8, 16, 32, 64, 128}. 
+ +\begin{CsynopsisCol} +void @\FuncDecl{shmem\_putmem\_signal}@(void *dest, const void *source, size_t nelems, uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +void @\FuncDecl{shmem\_ctx\_putmem\_signal}@(shmem_ctx_t ctx, void *dest, const void *source, size_t nelems, uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +\end{CsynopsisCol} + +\begin{apiarguments} + \apiargument{IN}{ctx}{A context handle specifying the context on which to + perform the operation. When this argument is not provided, the operation is + performed on the default context.} + \apiargument{OUT}{dest}{Data object to be updated on the remote \ac{PE}. + This data object must be remotely accessible.} + \apiargument{IN}{source}{Data object containing the data to be copied.} + \apiargument{IN}{nelems}{Number of elements in the \dest{} and \source{} + arrays. \VAR{nelems} must be of type \VAR{size\_t} for \Cstd.} + \apiargument{OUT}{sig\_addr}{signal data object to be updated on the remote + \ac{PE} as a signal. This signal data object must be remotely accessible.} + \apiargument{IN}{signal}{Unsigned 64-bit value that is used for updating the + remote \VAR{sig\_addr} signal data object.} + \apiargument{IN}{sig\_op}{Signal operator that represents the type of update + to be performed on the remote \VAR{sig\_addr} signal data object.} + \apiargument{IN}{pe}{\ac{PE} number of the remote \ac{PE}.} +\end{apiarguments} + +\apidescription{ + The \OPR{put-with-signal} routines provide a method for copying data from a + contiguous local data object to a data object on a specified \ac{PE} + and subsequently updating a remote flag to signal completion. The routines + return after the data has been copied out of the \source{} array on the + local \ac{PE}. + + The \VAR{sig\_op} signal operator determines the type of update to be + performed on the remote \VAR{sig\_addr} signal data object. 
The completion + of signal update based on the \VAR{sig\_op} signal operator using the + \VAR{signal} flag on the remote \ac{PE} indicates the delivery of its + corresponding \dest{} data words into the data object on the remote \ac{PE}. + + An update to the \VAR{sig\_addr} signal data object through a + \OPR{put-with-signal} routine completes as if performed atomically as described in + Section~\ref{subsec:signal_atomicity}. The various options as described in + Section~\ref{subsec:signal_operator} can be used as the \VAR{sig\_op} signal + operator. +} + +\apireturnvalues{ + None. +} + +\apinotes{ + The \dest{} and \VAR{sig\_addr} data objects must both be remotely + accessible. The \VAR{sig\_addr} and \dest{} could be of different kinds, + for example, one could be a global/static \Cstd variable and the other could + be allocated on the symmetric heap. + + \VAR{sig\_addr} and \dest{} may not be overlapping in memory. + + The completion of signal update using the \VAR{signal} flag on the remote + \ac{PE} indicates only the delivery of its corresponding \dest{} data words + into the data object on the remote \ac{PE}. Without a memory-ordering + operation, there is no implied ordering between the signal update of a + \OPR{put-with-signal} routine and another data transfer. For example, the + completion of the signal update in a sequence consisting of a put routine + followed by a \OPR{put-with-signal} routine does not imply delivery of the put + routine's data. +} + +\begin{apiexamples} + +\apicexample + {The following example demonstrates the usage of \FUNC{shmem\_put\_signal}. 
+ It shows the implementation of a broadcast operation from \ac{PE} 0 to + itself and all other \acp{PE} in the job as a simple ring-based algorithm + using \FUNC{shmem\_put\_signal}:} + {./example_code/shmem_put_signal_example.c} + {} +\end{apiexamples} + +\end{apidefinition} diff --git a/content/shmem_put_signal_nbi.tex b/content/shmem_put_signal_nbi.tex new file mode 100644 index 000000000..15c185bc7 --- /dev/null +++ b/content/shmem_put_signal_nbi.tex @@ -0,0 +1,91 @@ +\apisummary{ + The nonblocking \OPR{put-with-signal} routines provide a method for copying data + from a contiguous local data object to a data object on a specified \ac{PE} + and subsequently updating a remote flag to signal completion. +} + +\begin{apidefinition} + +\begin{C11synopsis} +void @\FuncDecl{shmem\_put\_signal\_nbi}@(TYPE *dest, const TYPE *source, size_t nelems, uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +void @\FuncDecl{shmem\_put\_signal\_nbi}@(shmem_ctx_t ctx, TYPE *dest, const TYPE *source, size_t nelems, uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +\end{C11synopsis} +where \TYPE{} is one of the standard \ac{RMA} types specified by Table \ref{stdrmatypes}. + +\begin{Csynopsis} +void @\FuncDecl{shmem\_\FuncParam{TYPENAME}\_put\_signal\_nbi}@(TYPE *dest, const TYPE *source, size_t nelems, uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +void @\FuncDecl{shmem\_ctx\_\FuncParam{TYPENAME}\_put\_signal\_nbi}@(shmem_ctx_t ctx, TYPE *dest, const TYPE *source, size_t nelems, uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +\end{Csynopsis} +where \TYPE{} is one of the standard \ac{RMA} types and has a corresponding \TYPENAME{} specified by Table \ref{stdrmatypes}. 
+ +\begin{CsynopsisCol} +void @\FuncDecl{shmem\_put\FuncParam{SIZE}\_signal\_nbi}@(void *dest, const void *source, size_t nelems, uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +void @\FuncDecl{shmem\_ctx\_put\FuncParam{SIZE}\_signal\_nbi}@(shmem_ctx_t ctx, void *dest, const void *source, size_t nelems, uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +\end{CsynopsisCol} +where \SIZE{} is one of \CONST{8, 16, 32, 64, 128}. + +\begin{CsynopsisCol} +void @\FuncDecl{shmem\_putmem\_signal\_nbi}@(void *dest, const void *source, size_t nelems, uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +void @\FuncDecl{shmem\_ctx\_putmem\_signal\_nbi}@(shmem_ctx_t ctx, void *dest, const void *source, size_t nelems, uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +\end{CsynopsisCol} + +\begin{apiarguments} + \apiargument{IN}{ctx}{A context handle specifying the context on which to + perform the operation. When this argument is not provided, the operation is + performed on the default context.} + \apiargument{OUT}{dest}{Data object to be updated on the remote \ac{PE}. + This data object must be remotely accessible.} + \apiargument{IN}{source}{Data object containing the data to be copied.} + \apiargument{IN}{nelems}{Number of elements in the \dest{} and \source{} + arrays. \VAR{nelems} must be of type \VAR{size\_t} for \Cstd.} + \apiargument{OUT}{sig\_addr}{Data object to be updated on the remote + \ac{PE} as the signal. 
This signal data object must be remotely accessible.} + \apiargument{IN}{signal}{Unsigned 64-bit value that is used for updating the + remote \VAR{sig\_addr} signal data object.} + \apiargument{IN}{sig\_op}{Signal operator that represents the type of update + to be performed on the remote \VAR{sig\_addr} signal data object.} + \apiargument{IN}{pe}{\ac{PE} number of the remote \ac{PE}.} +\end{apiarguments} + +\apidescription{ + The nonblocking \OPR{put-with-signal} routines provide a method for copying data + from a contiguous local data object to a data object on a specified \ac{PE} + and subsequently updating a remote flag to signal completion. + + The routines return after initiating the operation. The operation is considered + complete after a subsequent call to \FUNC{shmem\_quiet}. At the completion + of \FUNC{shmem\_quiet}, the data has been copied out of the \source{} array + on the local \ac{PE} and delivered into the \dest{} array on the destination + \ac{PE}. + + The delivery of \VAR{signal} flag on the remote \ac{PE} indicates only the + delivery of its corresponding \dest{} data words into the data object on the + remote \ac{PE}. Furthermore, two successive nonblocking \OPR{put-with-signal} + routines, or a nonblocking \OPR{put-with-signal} routine with another data + transfer may deliver data out of order unless a call to \FUNC{shmem\_fence} + is introduced between the two calls. + + The \VAR{sig\_op} signal operator determines the type of update to be + performed on the remote \VAR{sig\_addr} signal data object. + + An update to the \VAR{sig\_addr} signal data object through a nonblocking + \OPR{put-with-signal} routine completes as if performed atomically as described in + Section~\ref{subsec:signal_atomicity}. The various options as described in + Section~\ref{subsec:signal_operator} can be used as the \VAR{sig\_op} signal + operator. +} + +\apireturnvalues{ + None. 
+} + +\apinotes{ + The \dest{} and \VAR{sig\_addr} data objects must both be remotely + accessible. The \VAR{sig\_addr} and \dest{} could be of different kinds, + for example, one could be a global/static \Cstd variable and the other could + be allocated on the symmetric heap. + + \VAR{sig\_addr} and \dest{} may not be overlapping in memory. +} + +\end{apidefinition} diff --git a/content/shmem_query_interoperability.tex b/content/shmem_query_interoperability.tex new file mode 100644 index 000000000..a656f2497 --- /dev/null +++ b/content/shmem_query_interoperability.tex @@ -0,0 +1,47 @@ +\apisummary{ + Determines whether an interoperability feature is supported by the \openshmem + library implementation. +} +\begin{apidefinition} + +\begin{Csynopsis} +int @\FuncDecl{shmem\_query\_interoperability}@(int property); +\end{Csynopsis} + +\begin{apiarguments} + \apiargument{IN}{property}{The interoperability property queried by the user.} +\end{apiarguments} + +% compiling error ? +% \apidescription{ +\FUNC{shmem\_query\_interoperability} queries whether an interoperability property +is supported by the \openshmem library. One of the following properties can be +queried in an \openshmem program after finishing the +initialization call to \openshmem and that of the relevant programming models +being used in the program. An \openshmem library implementation may extend the +available properties. + +\begin{itemize} +\item \VAR{SHMEM\_PROGRESS\_MPI} Query whether the \openshmem +implementation makes progress for the MPI communication used in the user program. +\end{itemize} +% } + +\apireturnvalues{ + The return value is \CONST{1} if \VAR{property} is supported by the \openshmem library; + otherwise, it is \CONST{0}. +} + +\begin{apiexamples} + +\apicexample + {The following example queries whether the \openshmem library supports asynchronous +progress for MPI. 
If it returns 1, the library guarantees the MPI nonblocking send +is processed while PE 0 is in the busy wait loop with repeated calls to +\FUNC{shmem\_int\_atomic\_fetch} so that deadlock will not occur.} + {./example_code/shmem_query_mpi_progress.c} + {} + +\end{apiexamples} + +\end{apidefinition} diff --git a/content/shmem_quiet.tex b/content/shmem_quiet.tex index c89a3ea31..e0fb5e8c2 100644 --- a/content/shmem_quiet.tex +++ b/content/shmem_quiet.tex @@ -1,7 +1,8 @@ \apisummary{ - Waits for completion of all outstanding \PUT{}, \ac{AMO}, memory store, - and nonblocking \PUT{} and \GET{} routines to symmetric data - objects issued by a \ac{PE}. + Waits for completion of all outstanding memory store, blocking + \PUT{}, \ac{AMO}, and \OPR{put-with-signal}, as well as + nonblocking \PUT{}, \OPR{put-with-signal}, \GET{}, and \ac{AMO} routines + to symmetric data objects issued by a \ac{PE}. } \begin{apidefinition} @@ -18,10 +19,12 @@ \end{apiarguments} \apidescription{ - The \FUNC{shmem\_quiet} routine ensures completion of \PUT{}, \ac{AMO}, - memory store, and nonblocking \PUT{} and \GET{} routines on - symmetric data objects issued by the calling \ac{PE} on the given context. All \PUT{}, \ac{AMO}, - memory store, and nonblocking \PUT{} and \GET{} routines to + The \FUNC{shmem\_quiet} routine ensures completion of memory store, blocking + \PUT{}, \ac{AMO}, and + \OPR{put-with-signal}, as well as nonblocking \PUT{}, \OPR{put-with-signal}, \GET{}, and \ac{AMO} routines on + symmetric data objects issued by the calling \ac{PE} on the given context. + All memory store, blocking \PUT{}, \ac{AMO}, and \OPR{put-with-signal}, as + well as nonblocking \PUT{}, \OPR{put-with-signal}, \GET{}, and \ac{AMO} routines to symmetric data objects are guaranteed to be completed and visible to all \acp{PE} when \FUNC{shmem\_quiet} returns.
If \VAR{ctx} has the value \CONST{SHMEM\_CTX\_INVALID}, no operation is @@ -35,20 +38,22 @@ \apinotes{ \FUNC{shmem\_quiet} is most useful as a way of ensuring completion of - several \PUT{}, \ac{AMO}, memory store, and nonblocking \PUT{} - and \GET{} routines to symmetric data objects initiated by the calling + several memory store, blocking \PUT{}, \ac{AMO}, and \OPR{put-with-signal}, + as well as nonblocking \PUT{}, + \OPR{put-with-signal}, \GET{}, and \ac{AMO} routines to symmetric data objects initiated by the calling \ac{PE}. For example, one might use \FUNC{shmem\_quiet} to await delivery of a block of data before issuing another \PUT{} or nonblocking \PUT{} routine, which sets a completion flag on another \ac{PE}. \FUNC{shmem\_quiet} is not usually needed if \FUNC{shmem\_barrier\_all} or \FUNC{shmem\_barrier} are called. The barrier - routines wait for the completion of outstanding writes (\PUT{}, \ac{AMO}, - memory stores, and nonblocking \PUT{} and \GET{} routines) to + routines wait for the completion of outstanding writes (memory store, + blocking \PUT{}, \ac{AMO}, and \OPR{put-with-signal}, as well as nonblocking \PUT{}, + \OPR{put-with-signal}, \GET{}, and \ac{AMO} routines) to symmetric data objects on all \acp{PE}. In an \openshmem program with multithreaded \acp{PE}, it is the user's responsibility to ensure ordering between operations issued by the threads - in a \ac{PE} that target symmetric memory (e.g. \PUT{}, \ac{AMO}, memory stores, + in a \ac{PE} that target symmetric memory (e.g. \PUT{}, \ac{AMO}, \OPR{put-with-signal}, memory stores, and nonblocking routines) and calls by threads in that \ac{PE} to \FUNC{shmem\_quiet}. The \FUNC{shmem\_quiet} routine can enforce memory store ordering only for the calling thread. 
Thus, to ensure ordering for memory stores performed by a thread that is diff --git a/content/shmem_reductions.tex b/content/shmem_reductions.tex index 77326d52a..348257f73 100644 --- a/content/shmem_reductions.tex +++ b/content/shmem_reductions.tex @@ -5,87 +5,202 @@ \begin{apidefinition} + +\begin{table}[h] + \begin{center} + \begin{tabular}{|l|l|l|l|l|} + \hline + \TYPE & \TYPENAME & \multicolumn{3}{c|}{Operations Supporting \TYPE}\\ \hline + unsigned char & uchar & AND, OR, XOR & & \\ \hline + short & short & AND, OR, XOR & MAX, MIN & SUM, PROD \\ \hline + unsigned short & ushort & AND, OR, XOR & MAX, MIN & SUM, PROD \\ \hline + int & int & AND, OR, XOR & MAX, MIN & SUM, PROD \\ \hline + unsigned int & uint & AND, OR, XOR & MAX, MIN & SUM, PROD \\ \hline + long & long & AND, OR, XOR & MAX, MIN & SUM, PROD \\ \hline + unsigned long & ulong & AND, OR, XOR & MAX, MIN & SUM, PROD \\ \hline + long long & longlong & AND, OR, XOR & MAX, MIN & SUM, PROD \\ \hline + unsigned long long & ulonglong & AND, OR, XOR & MAX, MIN & SUM, PROD \\ \hline + float & float & & MAX, MIN & SUM, PROD \\ \hline + double & double & & MAX, MIN & SUM, PROD \\ \hline + long double & longdouble & & MAX, MIN & SUM, PROD \\ \hline + double \_Complex & complexd & & & SUM, PROD \\ \hline + float \_Complex & complexf & & & SUM, PROD \\ \hline + \end{tabular} + \TableCaptionRef{Reduction Types, Names and Supporting Operations} + \label{reducetypes} + \end{center} +\end{table} + + \paragraph{AND} Performs a bitwise AND reduction across a set of \acp{PE}.\newline + +%% C11 +\begin{C11synopsis} +int @\FuncDecl{shmem\_and\_reduce}@(shmem_team_t team, TYPE *dest, const TYPE *source, size_t nreduce); +\end{C11synopsis} +where \TYPE{} is one of the integer types supported for the AND operation as specified by Table \ref{reducetypes}. 
+ +%% C/C++ +\begin{Csynopsis} +\end{Csynopsis} +\begin{CsynopsisCol} +int @\FuncDecl{shmem\_\FuncParam{TYPENAME}\_and\_reduce}@(shmem_team_t team, TYPE *dest, const TYPE *source, size_t nreduce); +\end{CsynopsisCol} + +\begin{DeprecateBlock} +\begin{CsynopsisCol} +void @\FuncDecl{shmem\_\FuncParam{TYPENAME}\_and\_to\_all}@(TYPE *dest, const TYPE *source, int nreduce, int PE_start, int logPE_stride, int PE_size, TYPE *pWrk, long *pSync); +\end{CsynopsisCol} +\end{DeprecateBlock} +where \TYPE{} is one of the integer types supported for the AND operation and has a corresponding \TYPENAME{} as specified by Table \ref{reducetypes}. + +\paragraph{OR} +Performs a bitwise OR reduction across a set of \acp{PE}.\newline + +%% C11 +\begin{C11synopsis} +int @\FuncDecl{shmem\_or\_reduce}@(shmem_team_t team, TYPE *dest, const TYPE *source, size_t nreduce); +\end{C11synopsis} +where \TYPE{} is one of the integer types supported for the OR operation as specified by Table \ref{reducetypes}. + +%% C/C++ +\begin{Csynopsis} +\end{Csynopsis} +\begin{CsynopsisCol} +int @\FuncDecl{shmem\_\FuncParam{TYPENAME}\_or\_reduce}@(shmem_team_t team, TYPE *dest, const TYPE *source, size_t nreduce); +\end{CsynopsisCol} + +\begin{DeprecateBlock} +\begin{CsynopsisCol} +void @\FuncDecl{shmem\_\FuncParam{TYPENAME}\_or\_to\_all}@(TYPE *dest, const TYPE *source, int nreduce, int PE_start, int logPE_stride, int PE_size, TYPE *pWrk, long *pSync); +\end{CsynopsisCol} +\end{DeprecateBlock} +where \TYPE{} is one of the integer types supported for the OR operation and has a corresponding \TYPENAME{} as specified by Table \ref{reducetypes}. + +\paragraph{XOR} +Performs a bitwise exclusive OR (XOR) reduction across a set of \acp{PE}.\newline + +%% C11 +\begin{C11synopsis} +int @\FuncDecl{shmem\_xor\_reduce}@(shmem_team_t team, TYPE *dest, const TYPE *source, size_t nreduce); +\end{C11synopsis} +where \TYPE{} is one of the integer types supported for the XOR operation as specified by Table \ref{reducetypes}.
+ +%% C/C++ \begin{Csynopsis} -void @\FuncDecl{shmem\_short\_and\_to\_all}@(short *dest, const short *source, int nreduce, int PE_start, int logPE_stride, int PE_size, short *pWrk, long *pSync); -void @\FuncDecl{shmem\_int\_and\_to\_all}@(int *dest, const int *source, int nreduce, int PE_start, int logPE_stride, int PE_size, int *pWrk, long *pSync); -void @\FuncDecl{shmem\_long\_and\_to\_all}@(long *dest, const long *source, int nreduce, int PE_start, int logPE_stride, int PE_size, long *pWrk, long *pSync); -void @\FuncDecl{shmem\_longlong\_and\_to\_all}@(long long *dest, const long long *source, int nreduce, int PE_start, int logPE_stride, int PE_size, long long *pWrk, long *pSync); \end{Csynopsis} +\begin{CsynopsisCol} +int @\FuncDecl{shmem\_\FuncParam{TYPENAME}\_xor\_reduce}@(shmem_team_t team, TYPE *dest, const TYPE *source, size_t nreduce); +\end{CsynopsisCol} + +\begin{DeprecateBlock} +\begin{CsynopsisCol} +void @\FuncDecl{shmem\_\FuncParam{TYPENAME}\_xor\_to\_all}@(TYPE *dest, const TYPE *source, int nreduce, int PE_start, int logPE_stride, int PE_size, TYPE *pWrk, long *pSync); +\end{CsynopsisCol} +\end{DeprecateBlock} +where \TYPE{} is one of the integer types supported for the XOR operation and has a corresponding \TYPENAME{} as specified by Table \ref{reducetypes}. \paragraph{MAX} Performs a maximum-value reduction across a set of \acp{PE}.\newline + +%% C11 +\begin{C11synopsis} +int @\FuncDecl{shmem\_max\_reduce}@(shmem_team_t team, TYPE *dest, const TYPE *source, size_t nreduce); +\end{C11synopsis} +where \TYPE{} is one of the integer or real types supported for the MAX operation as specified by Table \ref{reducetypes}.
+ +%% C/C++ \begin{Csynopsis} -void @\FuncDecl{shmem\_short\_max\_to\_all}@(short *dest, const short *source, int nreduce, int PE_start, int logPE_stride, int PE_size, short *pWrk, long *pSync); -void @\FuncDecl{shmem\_int\_max\_to\_all}@(int *dest, const int *source, int nreduce, int PE_start, int logPE_stride, int PE_size, int *pWrk, long *pSync); -void @\FuncDecl{shmem\_double\_max\_to\_all}@(double *dest, const double *source, int nreduce, int PE_start, int logPE_stride, int PE_size, double *pWrk, long *pSync); -void @\FuncDecl{shmem\_float\_max\_to\_all}@(float *dest, const float *source, int nreduce, int PE_start, int logPE_stride, int PE_size, float *pWrk, long *pSync); -void @\FuncDecl{shmem\_long\_max\_to\_all}@(long *dest, const long *source, int nreduce, int PE_start, int logPE_stride, int PE_size, long *pWrk, long *pSync); -void @\FuncDecl{shmem\_longdouble\_max\_to\_all}@(long double *dest, const long double *source, int nreduce, int PE_start, int logPE_stride, int PE_size, long double *pWrk, long *pSync); -void @\FuncDecl{shmem\_longlong\_max\_to\_all}@(long long *dest, const long long *source, int nreduce, int PE_start, int logPE_stride, int PE_size, long long *pWrk, long *pSync); \end{Csynopsis} +\begin{CsynopsisCol} +int @\FuncDecl{shmem\_\FuncParam{TYPENAME}\_max\_reduce}@(shmem_team_t team, TYPE *dest, const TYPE *source, size_t nreduce); +\end{CsynopsisCol} + +\begin{DeprecateBlock} +\begin{CsynopsisCol} +void @\FuncDecl{shmem\_\FuncParam{TYPENAME}\_max\_to\_all}@(TYPE *dest, const TYPE *source, int nreduce, int PE_start, int logPE_stride, int PE_size, TYPE *pWrk, long *pSync); +\end{CsynopsisCol} +\end{DeprecateBlock} +where \TYPE{} is one of the integer or real types supported for the MAX operation and has a corresponding \TYPENAME{} as specified by Table \ref{reducetypes}.
+ \paragraph{MIN} Performs a minimum-value reduction across a set of \acp{PE}.\newline + +%% C11 +\begin{C11synopsis} +int @\FuncDecl{shmem\_min\_reduce}@(shmem_team_t team, TYPE *dest, const TYPE *source, size_t nreduce); +\end{C11synopsis} +where \TYPE{} is one of the integer or real types supported for the MIN operation as specified by Table \ref{reducetypes}. + +%% C/C++ \begin{Csynopsis} -void @\FuncDecl{shmem\_short\_min\_to\_all}@(short *dest, const short *source, int nreduce, int PE_start, int logPE_stride, int PE_size, short *pWrk, long *pSync); -void @\FuncDecl{shmem\_int\_min\_to\_all}@(int *dest, const int *source, int nreduce, int PE_start, int logPE_stride, int PE_size, int *pWrk, long *pSync); -void @\FuncDecl{shmem\_double\_min\_to\_all}@(double *dest, const double *source, int nreduce, int PE_start, int logPE_stride, int PE_size, double *pWrk, long *pSync); -void @\FuncDecl{shmem\_float\_min\_to\_all}@(float *dest, const float *source, int nreduce, int PE_start, int logPE_stride, int PE_size, float *pWrk, long *pSync); -void @\FuncDecl{shmem\_long\_min\_to\_all}@(long *dest, const long *source, int nreduce, int PE_start, int logPE_stride, int PE_size, long *pWrk, long *pSync); -void @\FuncDecl{shmem\_longdouble\_min\_to\_all}@(long double *dest, const long double *source, int nreduce, int PE_start, int logPE_stride, int PE_size, long double *pWrk, long *pSync); -void @\FuncDecl{shmem\_longlong\_min\_to\_all}@(long long *dest, const long long *source, int nreduce, int PE_start, int logPE_stride, int PE_size, long long *pWrk, long *pSync); \end{Csynopsis} +\begin{CsynopsisCol} +int @\FuncDecl{shmem\_\FuncParam{TYPENAME}\_min\_reduce}@(shmem_team_t team, TYPE *dest, const TYPE *source, size_t nreduce); +\end{CsynopsisCol} + +\begin{DeprecateBlock} +\begin{CsynopsisCol} +void @\FuncDecl{shmem\_\FuncParam{TYPENAME}\_min\_to\_all}@(TYPE *dest, const TYPE *source, int nreduce, int PE_start, int logPE_stride, int PE_size, TYPE *pWrk, long *pSync);
+\end{CsynopsisCol} +\end{DeprecateBlock} +where \TYPE{} is one of the integer or real types supported for the MIN operation and has a corresponding \TYPENAME{} as specified by Table \ref{reducetypes}. + \paragraph{SUM} Performs a sum reduction across a set of \acp{PE}.\newline + +%% C11 +\begin{C11synopsis} +int @\FuncDecl{shmem\_sum\_reduce}@(shmem_team_t team, TYPE *dest, const TYPE *source, size_t nreduce); +\end{C11synopsis} +where \TYPE{} is one of the integer, real, or complex types supported for the SUM operation as specified by Table \ref{reducetypes}. + +%% C/C++ \begin{Csynopsis} -void @\FuncDecl{shmem\_complexd\_sum\_to\_all}@(double _Complex *dest, const double _Complex *source, int nreduce, int PE_start, int logPE_stride, int PE_size, double _Complex *pWrk, long *pSync); -void @\FuncDecl{shmem\_complexf\_sum\_to\_all}@(float _Complex *dest, const float _Complex *source, int nreduce, int PE_start, int logPE_stride, int PE_size, float _Complex *pWrk, long *pSync); -void @\FuncDecl{shmem\_short\_sum\_to\_all}@(short *dest, const short *source, int nreduce, int PE_start, int logPE_stride, int PE_size, short *pWrk, long *pSync); -void @\FuncDecl{shmem\_int\_sum\_to\_all}@(int *dest, const int *source, int nreduce, int PE_start, int logPE_stride, int PE_size, int *pWrk, long *pSync); -void @\FuncDecl{shmem\_double\_sum\_to\_all}@(double *dest, const double *source, int nreduce, int PE_start, int logPE_stride, int PE_size, double *pWrk, long *pSync); -void @\FuncDecl{shmem\_float\_sum\_to\_all}@(float *dest, const float *source, int nreduce, int PE_start, int logPE_stride, int PE_size, float *pWrk, long *pSync); -void @\FuncDecl{shmem\_long\_sum\_to\_all}@(long *dest, const long *source, int nreduce, int PE_start, int logPE_stride,int PE_size, long *pWrk, long *pSync); -void @\FuncDecl{shmem\_longdouble\_sum\_to\_all}@(long double *dest, const long double *source, int nreduce, int PE_start, int logPE_stride, int PE_size, long double *pWrk, long *pSync); 
-void @\FuncDecl{shmem\_longlong\_sum\_to\_all}@(long long *dest, const long long *source, int nreduce, int PE_start, int logPE_stride, int PE_size, long long *pWrk, long *pSync); \end{Csynopsis} +\begin{CsynopsisCol} +int @\FuncDecl{shmem\_\FuncParam{TYPENAME}\_sum\_reduce}@(shmem_team_t team, TYPE *dest, const TYPE *source, size_t nreduce); +\end{CsynopsisCol} + +\begin{DeprecateBlock} +\begin{CsynopsisCol} +void @\FuncDecl{shmem\_\FuncParam{TYPENAME}\_sum\_to\_all}@(TYPE *dest, const TYPE *source, int nreduce, int PE_start, int logPE_stride, int PE_size, TYPE *pWrk, long *pSync); +\end{CsynopsisCol} +\end{DeprecateBlock} +where \TYPE{} is one of the integer, real, or complex types supported for the SUM operation and has a corresponding \TYPENAME{} as specified by Table \ref{reducetypes}. + \paragraph{PROD} Performs a product reduction across a set of \acp{PE}.\newline -\begin{Csynopsis} -void @\FuncDecl{shmem\_complexd\_prod\_to\_all}@(double _Complex *dest, const double _Complex *source, int nreduce, int PE_start, int logPE_stride, int PE_size, double _Complex *pWrk, long *pSync); -void @\FuncDecl{shmem\_complexf\_prod\_to\_all}@(float _Complex *dest, const float _Complex *source, int nreduce, int PE_start, int logPE_stride, int PE_size, float _Complex *pWrk, long *pSync); -void @\FuncDecl{shmem\_short\_prod\_to\_all}@(short *dest, const short *source, int nreduce, int PE_start, int logPE_stride, int PE_size, short *pWrk, long *pSync); -void @\FuncDecl{shmem\_int\_prod\_to\_all}@(int *dest, const int *source, int nreduce, int PE_start, int logPE_stride, int PE_size, int *pWrk, long *pSync); -void @\FuncDecl{shmem\_double\_prod\_to\_all}@(double *dest, const double *source, int nreduce, int PE_start, int logPE_stride, int PE_size, double *pWrk, long *pSync); -void @\FuncDecl{shmem\_float\_prod\_to\_all}@(float *dest, const float *source, int nreduce, int PE_start, int logPE_stride, int PE_size, float *pWrk, long *pSync); -void
@\FuncDecl{shmem\_long\_prod\_to\_all}@(long *dest, const long *source, int nreduce, int PE_start, int logPE_stride, int PE_size, long *pWrk, long *pSync); -void @\FuncDecl{shmem\_longdouble\_prod\_to\_all}@(long double *dest, const long double *source, int nreduce, int PE_start, int logPE_stride, int PE_size, long double *pWrk, long *pSync); -void @\FuncDecl{shmem\_longlong\_prod\_to\_all}@(long long *dest, const long long *source, int nreduce, int PE_start, int logPE_stride, int PE_size, long long *pWrk, long *pSync); -\end{Csynopsis} -\paragraph{OR} -Performs a bitwise OR reduction across a set of \acp{PE}.\newline -\begin{Csynopsis} -void @\FuncDecl{shmem\_short\_or\_to\_all}@(short *dest, const short *source, int nreduce, int PE_start, int logPE_stride, int PE_size, short *pWrk, long *pSync); -void @\FuncDecl{shmem\_int\_or\_to\_all}@(int *dest, const int *source, int nreduce, int PE_start, int logPE_stride, int PE_size, int *pWrk, long *pSync); -void @\FuncDecl{shmem\_long\_or\_to\_all}@(long *dest, const long *source, int nreduce, int PE_start, int logPE_stride, int PE_size, long *pWrk, long *pSync); -void @\FuncDecl{shmem\_longlong\_or\_to\_all}@(long long *dest, const long long *source, int nreduce, int PE_start, int logPE_stride, int PE_size, long long *pWrk, long *pSync); -\end{Csynopsis} +%% C11 +\begin{C11synopsis} +int @\FuncDecl{shmem\_prod\_reduce}@(shmem_team_t team, TYPE *dest, const TYPE *source, size_t nreduce); +\end{C11synopsis} +where \TYPE{} is one of the integer, real, or complex types supported for the PROD operation as specified by Table \ref{reducetypes}. 
-\paragraph{XOR} -Performs a bitwise exclusive OR (XOR) reduction across a set of \acp{PE}.\newline +%% C/C++ \begin{Csynopsis} -void @\FuncDecl{shmem\_short\_xor\_to\_all}@(short *dest, const short *source, int nreduce, int PE_start, int logPE_stride, int PE_size, short *pWrk, long *pSync); -void @\FuncDecl{shmem\_int\_xor\_to\_all}@(int *dest, const int *source, int nreduce, int PE_start, int logPE_stride, int PE_size, int *pWrk, long *pSync); -void @\FuncDecl{shmem\_long\_xor\_to\_all}@(long *dest, const long *source, int nreduce, int PE_start, int logPE_stride, int PE_size, long *pWrk, long *pSync); -void @\FuncDecl{shmem\_longlong\_xor\_to\_all}@(long long *dest, const long long *source, int nreduce, int PE_start, int logPE_stride, int PE_size, long long *pWrk, long *pSync); \end{Csynopsis} +\begin{CsynopsisCol} +int @\FuncDecl{shmem\_\FuncParam{TYPENAME}\_prod\_reduce}@(shmem_team_t team, TYPE *dest, const TYPE *source, size_t nreduce); +\end{CsynopsisCol} + +\begin{DeprecateBlock} +\begin{CsynopsisCol} +void @\FuncDecl{shmem\_\FuncParam{TYPENAME}\_prod\_to\_all}@(TYPE *dest, const TYPE *source, int nreduce, int PE_start, int logPE_stride, int PE_size, TYPE *pWrk, long *pSync); +\end{CsynopsisCol} +\end{DeprecateBlock} +where \TYPE{} is one of the integer, real, or complex types supported for the PROD operation and has a corresponding \TYPENAME{} as specified by Table \ref{reducetypes}. + \begin{apiarguments} +\apiargument{IN}{team}{The team over which to perform the operation.}% + \apiargument{OUT}{dest}{A symmetric array, of length \VAR{nreduce} elements, to receive the result of the reduction routines. The data type of \dest{} varies with the version of the reduction routine being called. When calling from @@ -94,7 +209,11 @@ contains one element for each separate reduction routine. The \source{} argument must have the same data type as \dest.} \apiargument{IN}{nreduce}{The number of elements in the \dest{} and \source{} - arrays.
\VAR{nreduce} must be of type integer.} + arrays. In teams based API calls, \VAR{nreduce} must be of type size\_t. + In deprecated active-set based API calls, + \VAR{nreduce} must be of type integer.} + +\begin{DeprecateBlock} \apiargument{IN}{PE\_start}{The lowest \ac{PE} number of the active set of \acp{PE}. \VAR{PE\_start} must be of type integer.} \apiargument{IN}{logPE\_stride}{The log (base 2) of the stride between consecutive @@ -111,48 +230,61 @@ Every element of this array must be initialized with the value \CONST{SHMEM\_SYNC\_VALUE} before any of the \acp{PE} in the active set enter the reduction routine.} +\end{DeprecateBlock} \end{apiarguments} \apidescription{ - \openshmem reduction routines compute one or more reductions across symmetric + \openshmem reduction routines are collective routines over an active set or + existing \openshmem team that compute one or more reductions across symmetric arrays on multiple \acp{PE}. A reduction performs an associative binary routine across a set of values. The \VAR{nreduce} argument determines the number of separate reductions to - perform. The \source{} array on all \acp{PE} in the active set provides one - element for each reduction. The results of the reductions are placed in the - \dest{} array on all \acp{PE} in the active set. The active set is defined - by the \VAR{PE\_start}, \VAR{logPE\_stride}, \VAR{PE\_size} triplet. - + perform. The \source{} array on all \acp{PE} participating in the reduction + provides one element for each reduction. The results of the reductions are placed in the + \dest{} array on all \acp{PE} participating in the reduction. + The \source{} and \dest{} arrays may be the same array, but they may not be - overlapping arrays. + overlapping arrays. The same \dest{} and \source{} arrays + must be passed to all \acp{PE} participating in the reduction. + + Team-based reduction routines operate over all \acp{PE} in the provided team argument. 
All + \acp{PE} in the provided team must participate in the reduction. If an invalid team handle + or \LibConstRef{SHMEM\_TEAM\_INVALID} is passed to this routine, the behavior is undefined. + + Active-set-based reduction routines operate over all \acp{PE} in the active set + defined by the \VAR{PE\_start}, \VAR{logPE\_stride}, \VAR{PE\_size} triplet. - As with all \openshmem collective routines, each of these routines assumes + As with all active-set-based collective routines, + each of these routines assumes that only \acp{PE} in the active set call the routine. If a \ac{PE} not in - the active set calls an \openshmem collective routine, the behavior is undefined. + the active set calls an active-set-based collective routine, + the behavior is undefined. - The values of arguments \VAR{nreduce}, \VAR{PE\_start}, \VAR{logPE\_stride}, and - \VAR{PE\_size} must be equal on all \acp{PE} in the active set. The same \dest{} - and \source{} arrays, and the same \VAR{pWrk} and \VAR{pSync} work arrays, must - be passed to all \acp{PE} in the active set. + The values of arguments \VAR{nreduce}, \VAR{PE\_start}, \VAR{logPE\_stride}, + and \VAR{PE\_size} must be equal on all \acp{PE} in the active set. + The same \VAR{pWrk} and \VAR{pSync} work arrays must be passed to all + \acp{PE} in the active set. - Before any \ac{PE} calls a reduction routine, - the following conditions must be ensured: + Before any \ac{PE} calls a reduction routine, the following conditions must be ensured: \begin{itemize} - \item The \VAR{pWrk} and \VAR{pSync} arrays on all \acp{PE} in the + \item The \dest{} array on all \acp{PE} participating in the reduction + is ready to accept the results of the \OPR{reduction}. + \item If using active-set-based routines, the + \VAR{pWrk} and \VAR{pSync} arrays on all \acp{PE} in the active set are not still in use from a prior call to a collective \openshmem routine.
- \item The \dest{} array on all \acp{PE} in the active set is ready - to accept the results of the \OPR{reduction}. \end{itemize} Otherwise, the behavior is undefined. - + Upon return from a reduction routine, the following are true for the local - \ac{PE}: The \dest{} array is updated and the \source{} array may be safely reused. - The values in the \VAR{pSync} array are - restored to the original values. - + \ac{PE}: + \begin{itemize} + \item The \dest{} array is updated and the \source{} array may be safely reused. + \item If using active-set-based routines, + the values in the \VAR{pSync} array are restored to the original values. + \end{itemize} The complex-typed interfaces are only provided for sum and product reductions. When the \Cstd translation environment does not support complex types @@ -163,7 +295,7 @@ } \apireturnvalues{ - None. + Zero on successful local completion. Nonzero otherwise. } \apinotes{ @@ -182,4 +314,16 @@ by doing some type of synchronization. } +\begin{apiexamples} + +\apifexample + {This \CorCpp reduction example gets integers from an external + source (random generator in this example), tests to see if the \ac{PE} got a valid + value, and outputs the sum of values for which all \acp{PE} got a valid + value.} + {./example_code/shmem_reduce_example.c} + {} + +\end{apiexamples} + \end{apidefinition} diff --git a/content/shmem_signal_fetch.tex b/content/shmem_signal_fetch.tex new file mode 100644 index 000000000..35094245e --- /dev/null +++ b/content/shmem_signal_fetch.tex @@ -0,0 +1,31 @@ +\apisummary{ + Fetches the signal update on a local data object.
+} + +\begin{apidefinition} + +\begin{Csynopsis} +uint64_t @\FuncDecl{shmem\_signal\_fetch}@(const uint64_t *sig_addr); +\end{Csynopsis} + +\begin{apiarguments} + \apiargument{IN}{sig\_addr}{A pointer to a remotely accessible variable.} +\end{apiarguments} + +\apidescription{ + \FUNC{shmem\_signal\_fetch} performs a fetch operation and returns the + contents of the \VAR{sig\_addr} signal data object. Access to + \VAR{sig\_addr} signal object at the calling \ac{PE} is expected to satisfy + the atomicity guarantees as described in Section~\ref{subsec:signal_atomicity}. +} + +\apireturnvalues{ + Returns the contents of the signal data object, \VAR{sig\_addr}, at the + calling \ac{PE}. +} + +\apinotes{ + None. +} + +\end{apidefinition} diff --git a/content/shmem_signal_wait_until.tex b/content/shmem_signal_wait_until.tex new file mode 100644 index 000000000..35a1a14e9 --- /dev/null +++ b/content/shmem_signal_wait_until.tex @@ -0,0 +1,55 @@ +\apisummary{ + Wait for a variable on the local \ac{PE} to change from a signaling + operation. +} + +\begin{apidefinition} + +\begin{Csynopsis} +uint64_t @\FuncDecl{shmem\_signal\_wait\_until}@(uint64_t *sig_addr, int cmp, uint64_t cmp_value); +\end{Csynopsis} + +\begin{apiarguments} + +\apiargument{IN}{sig\_addr}{A pointer to a remotely accessible variable.} +\apiargument{IN}{cmp}{The comparison operator that compares \VAR{sig\_addr} with + \VAR{cmp\_value}.} +\apiargument{IN}{cmp\_value}{The value against which the object pointed to + by \VAR{sig\_addr} will be compared.} + +\end{apiarguments} + +\apidescription{ + \FUNC{shmem\_signal\_wait\_until} operation blocks until the value contained + in the signal data object, \VAR{sig\_addr}, at the calling \ac{PE} satisfies + the wait condition. 
In an \openshmem program with single-threaded or + multithreaded \acp{PE}, the \VAR{sig\_addr} object at the calling \ac{PE} is + expected only to be updated as a signal, through the signaling operations + available in Section~\ref{subsec:shmem_put_signal} and + Section~\ref{subsec:shmem_put_signal_nbi}. + + This routine can be used to implement point-to-point synchronization between + \acp{PE} or between threads within the same \ac{PE}. A call to this routine + blocks until the value of \VAR{sig\_addr} at the calling \ac{PE} satisfies + the wait condition specified by the comparison operator, \VAR{cmp}, and + comparison value, \VAR{cmp\_value}. +} + +\apireturnvalues{ + Returns the contents of the signal data object, \VAR{sig\_addr}, at the + calling \ac{PE} that satisfies the wait condition. +} + + +\apinotes{ + None. +} + +\apiimpnotes{ + Implementations must ensure that \FUNC{shmem\_signal\_wait\_until} does not + return before the update of the memory indicated by \VAR{sig\_addr} is fully + complete. Partial updates to the memory must not cause + \FUNC{shmem\_signal\_wait\_until} to return. +} + +\end{apidefinition} diff --git a/content/shmem_sync.tex b/content/shmem_sync.tex index 2c5707929..ceacc2115 100644 --- a/content/shmem_sync.tex +++ b/content/shmem_sync.tex @@ -1,16 +1,31 @@ \apisummary{ - Performs all operations described in the \FUNC{shmem\_sync\_all} interface - but with respect to a subset of \acp{PE} defined by the active set. + Registers the arrival of a \ac{PE} at a synchronization point and suspends + execution until all other \acp{PE} in a given \openshmem team or active set + arrive at a synchronization point. For multithreaded programs, execution is suspended + as specified by the threading model (Section \ref{subsec:thread_support}).
} \begin{apidefinition} +\begin{C11synopsis} +int @\FuncDecl{shmem\_sync}@(shmem_team_t team); +\end{C11synopsis} + \begin{Csynopsis} -void @\FuncDecl{shmem\_sync}@(int PE_start, int logPE_stride, int PE_size, long *pSync); +int @\FuncDecl{shmem\_team\_sync}@(shmem_team_t team); \end{Csynopsis} +\begin{DeprecateBlock} +\begin{CsynopsisCol} +void @\FuncDecl{shmem\_sync}@(int PE_start, int logPE_stride, int PE_size, long *pSync); +\end{CsynopsisCol} +\end{DeprecateBlock} + \begin{apiarguments} +\apiargument{IN}{team}{The team over which to perform the operation.}% + +\begin{DeprecateBlock} \apiargument{IN}{PE\_start}{The lowest \ac{PE} number of the active set of \acp{PE}. \VAR{PE\_start} must be of type integer.} \apiargument{IN}{logPE\_stride}{The log (base 2) of the stride between @@ -22,18 +37,33 @@ of type \CTYPE{long} and size \CONST{SHMEM\_BARRIER\_SYNC\_SIZE}. Every element of this array must be initialized to \CONST{SHMEM\_SYNC\_VALUE} before any of the \acp{PE} in the active set enter \FUNC{shmem\_sync} the first time.} +\end{DeprecateBlock} \end{apiarguments} \apidescription{ \FUNC{shmem\_sync} is a collective synchronization routine over an - active set. Control returns from \FUNC{shmem\_sync} after all \acp{PE} in - the active set (specified by \VAR{PE\_start}, \VAR{logPE\_stride}, and - \VAR{PE\_size}) have called \FUNC{shmem\_sync}. + existing \openshmem team or active set. + + The routine registers the arrival of a \ac{PE} at a synchronization point in the program. + This is a fast mechanism for synchronizing all \acp{PE} that participate in this + collective call. The routine blocks the calling \ac{PE} until all \acp{PE} in the + specified team or active set have called \FUNC{shmem\_sync}. In a multithreaded \openshmem + program, only the calling thread is blocked. + + Team-based sync routines operate over all \acp{PE} in the provided team argument. All + \acp{PE} in the provided team must participate in the sync operation. 
+ If an invalid team handle or \LibConstRef{SHMEM\_TEAM\_INVALID} + is passed to this routine, the behavior is undefined. + + Active-set-based sync routines operate over all \acp{PE} in the active set + defined by the \VAR{PE\_start}, \VAR{logPE\_stride}, \VAR{PE\_size} triplet. - As with all \openshmem collective routines, each of these routines assumes + As with all active set-based collective routines, + each of these routines assumes that only \acp{PE} in the active set call the routine. If a \ac{PE} not in - the active set calls an \openshmem collective routine, the behavior is undefined. + the active set calls an active set-based collective routine, + the behavior is undefined. The values of arguments \VAR{PE\_start}, \VAR{logPE\_stride}, and \VAR{PE\_size} must be equal on all \acp{PE} in the active set. The same @@ -48,7 +78,7 @@ } \apireturnvalues{ - None. + Zero on successful local completion. Nonzero otherwise. } \apinotes{ diff --git a/content/shmem_sync_all.tex b/content/shmem_sync_all.tex index 8d6c95244..5c70a966e 100644 --- a/content/shmem_sync_all.tex +++ b/content/shmem_sync_all.tex @@ -1,6 +1,7 @@ \apisummary{ - Registers the arrival of a \ac{PE} at a barrier and suspends \ac{PE} - execution until all other \acp{PE} arrive at the barrier. + Registers the arrival of a \ac{PE} at a synchronization point and suspends + execution until all other \acp{PE} in the default team arrive at a synchronization point. For multithreaded programs, execution is suspended + as specified by the threading model (Section \ref{subsec:thread_support}). } \begin{apidefinition} @@ -16,11 +17,12 @@ \end{apiarguments} \apidescription{ - The \FUNC{shmem\_sync\_all} routine registers the arrival of a \ac{PE} at a - barrier. Barriers are a fast mechanism for synchronizing all \acp{PE} at - once. This routine blocks the \ac{PE} until all \acp{PE} have called - \FUNC{shmem\_sync\_all}. In a multithreaded \openshmem - program, only the calling thread is blocked. 
+ + This routine blocks the calling \ac{PE} until all \acp{PE} in the + default team have called \FUNC{shmem\_sync\_all}. + + In a multithreaded \openshmem program, only the calling thread is + blocked. In contrast with the \FUNC{shmem\_barrier\_all} routine, \FUNC{shmem\_sync\_all} only ensures completion and visibility of previously issued memory @@ -33,11 +35,8 @@ } \apinotes{ - The \FUNC{shmem\_sync\_all} routine can be used to portably ensure that - memory access operations observe remote updates in the order enforced by the - initiator \acp{PE}, provided that the initiator PE ensures completion of remote - updates with a call to \FUNC{shmem\_quiet} prior to the call to the - \FUNC{shmem\_sync\_all} routine. + The \FUNC{shmem\_sync\_all} routine is equivalent to calling + \FUNC{shmem\_team\_sync} on the default team. } \end{apidefinition} diff --git a/content/shmem_team_config_t.tex b/content/shmem_team_config_t.tex new file mode 100644 index 000000000..720f9e278 --- /dev/null +++ b/content/shmem_team_config_t.tex @@ -0,0 +1,71 @@ +\apisummary{ + A structure type representing team configuration arguments +} + +\begin{apidefinition} + +\begin{Csynopsis} +typedef struct { + int num_contexts; +} shmem_team_config_t; +\end{Csynopsis} + +\begin{apiarguments} + None. +\end{apiarguments} + + +\apidescription{ + A team configuration argument acts as an input to the + \FUNC{shmem\_team\_split\_*} routines. + It specifies the requested capabilities of the team to be + created. + + The \VAR{num\_contexts} member specifies the total number of simultaneously + existing contexts that the program requests to create from this team. + These contexts may be created in any number of threads. Successful + creation of a team configured with \VAR{num\_contexts} of $N$ means + that the implementation will make a best effort to reserve enough + resources to allow the team to have $N$ contexts created from the team + in existence at any given time.
It is not a guarantee that $N$ + calls to \FUNC{shmem\_team\_create\_ctx} will succeed. + See Section~\ref{sec:ctx} for more on communication contexts and + Section~\ref{subsec:shmem_team_create_ctx} for team-based context creation. + + When using the configuration structure to create teams, a mask parameter + controls which fields to use. + Any configuration parameter value that is not indicated in the mask will be + ignored, and the default value will be used instead. + Therefore, a program does not have to set all fields in the config struct; + only those for which it does not want the default values. + + A configuration mask value is created by combining individual field + masks through a bitwise OR operation of the following library constants: + + { + \apitablerow{\LibConstRef{SHMEM\_TEAM\_NUM\_CONTEXTS}}{ + The team should be created using the value of the + \VAR{num\_contexts} member of the configuration parameter + \VAR{config} as a requirement. + } + } + + A configuration mask value of \CONST{0} indicates that the team + should be created with the default values for all configuration + parameters. + + The default values for configuration parameters are: + + { + \apitablerow{num\_contexts = \CONST{0}}{ + By default, no contexts can be created on a new team + } + } + +} + +\apinotes{ + None. +} + +\end{apidefinition} diff --git a/content/shmem_team_create_ctx.tex b/content/shmem_team_create_ctx.tex new file mode 100644 index 000000000..dce63be92 --- /dev/null +++ b/content/shmem_team_create_ctx.tex @@ -0,0 +1,63 @@ +\apisummary{ + Create a communication context from a team locally. +} + +\begin{apidefinition} + +\begin{Csynopsis} +int @\FuncDecl{shmem\_team\_create\_ctx}@(shmem_team_t team, long options, shmem_ctx_t *ctx); +\end{Csynopsis} + +\begin{apiarguments} + \apiargument{IN}{team}{A handle to the specified \ac{PE} team.} + \apiargument{IN}{options}{ + The set of options requested for the given context.
+ Multiple options may be requested by combining them with a bitwise OR + operation; otherwise, \CONST{0} can be given if no options are requested.} + \apiargument{OUT}{ctx}{A handle to the newly created context.} +\end{apiarguments} + +\apidescription{ + The \FUNC{shmem\_team\_create\_ctx} routine creates a new communication + context and returns its handle through the \VAR{ctx} argument. + This context is created from the team specified by the \VAR{team} argument. + + In addition to the team, the \FUNC{shmem\_team\_create\_ctx} routine accepts + the same arguments and provides all the same return conditions as the + \FUNC{shmem\_ctx\_create} routine. + + The \FUNC{shmem\_team\_create\_ctx} routine may be called any number of times + to create multiple simultaneously existing contexts for the team. Programs + should request the total number of simultaneous contexts to be created from + the team during team creation. See Section~\ref{subsec:shmem_team_config_t} + for more information on how to request contexts during team creation. + + A call to \FUNC{shmem\_team\_create\_ctx} on a team may fail, regardless + of the configuration request for contexts, if the implementation is unable + to create a context at the time when \FUNC{shmem\_team\_create\_ctx} is + called. + + All explicitly created resources associated with a team must be destroyed + before the \FUNC{shmem\_team\_destroy} routine is called. If a context + returned from \FUNC{shmem\_team\_create\_ctx} is not explicitly + destroyed before the team is destroyed, behavior is undefined. + + All \openshmem routines that operate on this context will do so with + respect to the associated \ac{PE} team. + That is, all point-to-point routines operating on this context will use + team-relative \ac{PE} numbering. +} + +\apireturnvalues{ + Zero on success and nonzero otherwise. +} + +\apinotes{ + None. 
+} + +\begin{apiexamples} + See example in Section \ref{subsec:shmem_ctx_get_team} +\end{apiexamples} + +\end{apidefinition} diff --git a/content/shmem_team_destroy.tex b/content/shmem_team_destroy.tex new file mode 100644 index 000000000..00df39af8 --- /dev/null +++ b/content/shmem_team_destroy.tex @@ -0,0 +1,42 @@ +\apisummary{ + Destroy an existing team. +} + +\begin{apidefinition} + +\begin{Csynopsis} +void @\FuncDecl{shmem\_team\_destroy}@(shmem_team_t team); +\end{Csynopsis} + +\begin{apiarguments} +\apiargument{IN}{team}{An \openshmem team handle.} +\end{apiarguments} + +\apidescription{ + +The \FUNC{shmem\_team\_destroy} routine is a collective operation that +destroys the team referenced by the team handle argument \VAR{team}. +Upon return, the referenced team is invalid. + +This routine destroys all shareable contexts created from the +referenced team. The user is responsible for destroying all contexts +created from this team with the \CONST{SHMEM\_CTX\_PRIVATE} option +enabled prior to calling this routine; otherwise, the behavior is +undefined. + +It is an error to free the default team or any other predefined team. + +When \VAR{team} specifies an invalid team, if \VAR{team} compares +equal to \LibConstRef{SHMEM\_TEAM\_INVALID}, then no operation is +performed; otherwise, the behavior is undefined. +} + +\apireturnvalues{ + None. +} + +\apinotes{ + None. 
+} + +\end{apidefinition} diff --git a/content/shmem_team_get_config.tex b/content/shmem_team_get_config.tex new file mode 100644 index 000000000..f06c0c832 --- /dev/null +++ b/content/shmem_team_get_config.tex @@ -0,0 +1,38 @@ +\apisummary{ + Return the configuration parameters of a given team +} + +\begin{apidefinition} + +\begin{Csynopsis} +int @\FuncDecl{shmem\_team\_get\_config}@(shmem_team_t team, shmem_team_config_t *config); +\end{Csynopsis} + +\begin{apiarguments} + \apiargument{IN}{team}{An \openshmem team handle.} + \apiargument{OUT}{config}{ + A pointer to the configuration parameters for the given team.} +\end{apiarguments} + +\apidescription{ +\FUNC{shmem\_team\_get\_config} returns through the \VAR{config} argument +the configuration parameters of the given team, which were assigned according +to input configuration parameters when the team was created. + +When \VAR{team} specifies an invalid team, if \VAR{team} compares +equal to \LibConstRef{SHMEM\_TEAM\_INVALID}, then no operation is +performed; otherwise, the behavior is undefined. +} + +\apireturnvalues{ + If \VAR{team} does not compare equal to + \LibConstRef{SHMEM\_TEAM\_INVALID}, then + \FUNC{shmem\_team\_get\_config} returns \CONST{0}; otherwise, + returns nonzero. +} + +\apinotes{ + None. +} + +\end{apidefinition} diff --git a/content/shmem_team_my_pe.tex b/content/shmem_team_my_pe.tex new file mode 100644 index 000000000..74a216e7a --- /dev/null +++ b/content/shmem_team_my_pe.tex @@ -0,0 +1,38 @@ +\apisummary{ + Returns the number of the calling \ac{PE} within a specified team. +} + +\begin{apidefinition} + +\begin{Csynopsis} +int @\FuncDecl{shmem\_team\_my\_pe}@(shmem_team_t team); +\end{Csynopsis} + +\begin{apiarguments} +\apiargument{IN}{team}{An \openshmem team handle.} +\end{apiarguments} + +\apidescription{ + When \VAR{team} specifies a valid team, the + \FUNC{shmem\_team\_my\_pe} routine returns the number of the calling + \ac{PE} within the specified team. 
+ The number is an integer between $0$ and $N-1$ for a team of size $N$. + Each member of the team has a unique number. + + When \VAR{team} specifies an invalid team, if \VAR{team} compares + equal to \LibConstRef{SHMEM\_TEAM\_INVALID}, then the value + \CONST{-1} is returned; otherwise, the behavior is undefined. +} + +\apireturnvalues{ + The number of the calling \ac{PE} within the specified team, or the + value \CONST{-1} if the team handle compares equal to + \LibConstRef{SHMEM\_TEAM\_INVALID}. +} + +\apinotes{ + For the default team, this routine will return the same value as + \FUNC{shmem\_my\_pe}. +} + +\end{apidefinition} diff --git a/content/shmem_team_n_pes.tex b/content/shmem_team_n_pes.tex new file mode 100644 index 000000000..20a99edf9 --- /dev/null +++ b/content/shmem_team_n_pes.tex @@ -0,0 +1,38 @@ +\apisummary{ + Returns the number of \acp{PE} in a specified team. +} + +\begin{apidefinition} + +\begin{Csynopsis} +int @\FuncDecl{shmem\_team\_n\_pes}@(shmem_team_t team); +\end{Csynopsis} + +\begin{apiarguments} +\apiargument{IN}{team}{An \openshmem team handle.} +\end{apiarguments} + +\apidescription{ + When \VAR{team} specifies a valid team, the + \FUNC{shmem\_team\_n\_pes} routine returns the number of \acp{PE} in + the team. + This will always be a value between $1$ and $N$, where $N$ is the + total number of \acp{PE} running in the \openshmem program. + + When \VAR{team} specifies an invalid team, if \VAR{team} compares + equal to \LibConstRef{SHMEM\_TEAM\_INVALID}, then the value + \CONST{-1} is returned; otherwise, the behavior is undefined. +} + +\apireturnvalues{ + The number of \acp{PE} in the specified team, or the value + \CONST{-1} if the team handle compares equal to + \LibConstRef{SHMEM\_TEAM\_INVALID}. +} + +\apinotes{ + For the default team, this routine will return the same value as + \FUNC{shmem\_n\_pes}. 
+} + +\end{apidefinition} diff --git a/content/shmem_team_split_2d.tex b/content/shmem_team_split_2d.tex new file mode 100644 index 000000000..1e75ad3d0 --- /dev/null +++ b/content/shmem_team_split_2d.tex @@ -0,0 +1,245 @@ +\apisummary{ +Create two new teams by splitting an existing parent team into two subsets based on a +2D Cartesian space defined by the \VAR{xrange} argument and a \VAR{y} dimension derived from \VAR{xrange} +and the parent team size. These ranges describe the Cartesian space in \emph{x}- +and \emph{y}-dimensions.} + +\begin{apidefinition} + +\begin{Csynopsis} +int @\FuncDecl{shmem\_team\_split\_2d}@(shmem_team_t parent_team, int xrange, + const shmem_team_config_t *xaxis_config, long xaxis_mask, shmem_team_t *xaxis_team, + const shmem_team_config_t *yaxis_config, long yaxis_mask, shmem_team_t *yaxis_team); +\end{Csynopsis} + +\begin{apiarguments} +\apiargument{IN}{parent\_team}{A valid \openshmem team. Any predefined teams, such as +\LibHandleRef{SHMEM\_TEAM\_WORLD}, may be used, or any team created by the user.} + +\apiargument{IN}{xrange}{A nonnegative integer representing the number of +elements in the first dimension.} + +\apiargument{IN}{xaxis\_config}{ + A pointer to the configuration parameters for the new \VAR{x}-axis team.} + +\apiargument{IN}{xaxis\_mask}{ + The bitwise mask representing the set of configuration parameters to use + from \VAR{xaxis\_config}.} + +\apiargument{OUT}{xaxis\_team}{A new \ac{PE} team handle representing a \ac{PE} +subset consisting of all the \acp{PE} that have the same coordinate along the \VAR{y}-axis +as the calling \ac{PE}.} + +\apiargument{IN}{yaxis\_config}{ + A pointer to the configuration parameters for the new \VAR{y}-axis team.} + +\apiargument{IN}{yaxis\_mask}{ + The bitwise mask representing the set of configuration parameters to use + from \VAR{yaxis\_config}.} + +\apiargument{OUT}{yaxis\_team}{A new \ac{PE} team handle representing a \ac{PE} +subset consisting of all the \acp{PE} that have the
same coordinate along the \VAR{x}-axis +as the calling \ac{PE}.} +\end{apiarguments} + +\apidescription{ +The \FUNC{shmem\_team\_split\_2d} routine is a collective routine. It creates two +new teams by splitting an existing parent team into up to two subsets based on a +2D Cartesian space. The user provides the size of the \VAR{x} dimension, which is then +used to derive the size of the \VAR{y} dimension based on the size of the parent team. +The size of the \VAR{y} dimension will be equal to $\lceil N \div xrange \rceil$, where +\VAR{N} is the size of the parent team. In other words, +$xrange \times yrange \geq N$, so that every \ac{PE} in the parent team has a +unique \VAR{(x,y)} location in the 2D Cartesian space. + +The mapping of \ac{PE} number to coordinates is $(x, y) = ( pe \mod xrange, \lfloor pe \div xrange \rfloor )$, +where $pe$ is the \ac{PE} number in the parent team. So, if $xrange = 3$, +then the first 3 \acp{PE} in the parent team will form the first +\VAR{xteam}, the second three \acp{PE} in the parent team form the second \VAR{xteam}, +and so on. + +Thus, after the split operation, each of the new \VAR{xteam}s will contain all \acp{PE} that +have the same coordinate along the \VAR{y}-axis as the calling \ac{PE}. Each of the +new \VAR{yteam}s will contain all \acp{PE} with the same coordinate along the +\VAR{x}-axis as the calling \ac{PE}. + +The \acp{PE} are numbered in the new teams based on the coordinate of the +\ac{PE} along the given axis. So, another way to think of the result of the split +operation is that the value returned by \FUNC{shmem\_team\_my\_pe(\VAR{xteam})} is the +x-coordinate and the value returned by \FUNC{shmem\_team\_my\_pe(\VAR{yteam})} +is the y-coordinate of the calling \ac{PE}. + +Any valid \openshmem team can be used as the parent team. This routine must be +called by all \acp{PE} in the parent team.
The value of \VAR{xrange} must be +nonnegative and all \acp{PE} in the parent team must pass the same value for +\VAR{xrange}. + +The \VAR{xaxis\_config} and \VAR{yaxis\_config} arguments specify team +configuration parameters for the \VAR{x}- and \VAR{y}-axis teams, respectively. +These parameters are described in Section~\ref{subsec:shmem_team_config_t}. +All \acp{PE} that will be in the same resultant team must specify the same +configuration parameters. +The \acp{PE} in the parent team \emph{do not} have to all provide the same +parameters for new teams. + +The \VAR{xaxis\_mask} and \VAR{yaxis\_mask} arguments are bitwise masks +representing the set of configuration parameters to use from +\VAR{xaxis\_config} and \VAR{yaxis\_config}, respectively. +A mask value of \CONST{0} indicates that the team +should be created with the default values for all configuration parameters. +See Section~\ref{subsec:shmem_team_config_t} for field mask names and +default configuration parameters. + +If \VAR{parent\_team} is an invalid team handle, the behavior is undefined. + +If \VAR{parent\_team} compares equal to \LibConstRef{SHMEM\_TEAM\_INVALID}, no new +teams will be created, and both \VAR{xaxis\_team} and \VAR{yaxis\_team} +will be assigned the value \LibConstRef{SHMEM\_TEAM\_INVALID}. + +If either team cannot be created, that team will be assigned the value +\LibConstRef{SHMEM\_TEAM\_INVALID}. +} + +\apireturnvalues{ + Zero on successful creation of both \VAR{xaxis\_team} and \VAR{yaxis\_team}, + nonzero otherwise. +} + +\apinotes{ +Since the split may result in a 2D space with more points than there are members of +the parent team, there may be a final, incomplete row of the 2D mapping of the parent +team. This means that the resultant \VAR{yteam}s may vary in size by up to 1 \ac{PE}, +and that there may be one resultant \VAR{xteam} of smaller size than all of the other +\VAR{xteam}s.
+ +The following grid shows the 12 teams that would result from splitting a parent team +of size 10 with \VAR{xrange} of 3. The numbers in the grid cells are the \ac{PE} numbers +in the parent team. The rows are the \VAR{xteam}s. The columns are the \VAR{yteam}s. + +\begin{center} +\begin{tabular}{|l|l|l|l|} + \hline + & yteam & yteam & yteam \\ + & x=0 & x=1 & x=2 \\ \hline + xteam, y=0 & 0 & 1 & 2 \\ \hline + xteam, y=1 & 3 & 4 & 5 \\ \hline + xteam, y=2 & 6 & 7 & 8 \\ \hline + xteam, y=3 & 9 \\ + \cline{0-1} +\end{tabular} +\end{center} + +It would be legal, for example, if \acp{PE} 0, 3, 6, 9 specified a different value +for \VAR{yaxis\_config} than all of the other \acp{PE}, as long as the configuration parameters match +for all \acp{PE} in each of the new teams. + +See the description of team handles and predefined teams at the top of section +\ref{subsec:team} for more information about team handle semantics and usage. +} + +\begin{apiexamples} + + \apicexample + {The following example demonstrates the use of 2D Cartesian split in a + \Cstd[11] program. This example shows how multiple 2D splits can be used + to generate a 3D Cartesian split. This method can be extrapolated to + generate splits of any number of dimensions.} + {./example_code/shmem_team_split_2D.c} + { + The example above splits \LibHandleRef{SHMEM\_TEAM\_WORLD} into a 3D team + with dimensions 3x4xN. For example, if \VAR{npes} = 16, \VAR{xdim} = 3, + and \VAR{ydim} = 4, then the final dimensions are 3x4x2. 
In this case, the + first split of \LibHandleRef{SHMEM\_TEAM\_WORLD} results in 6 \VAR{xteams} + and 3 \VAR{yzteams}: + + \begin{center} + \begin{tabular}{|l|l|l|l|l|} + \hline + \multicolumn{2}{|c|}{} & \multicolumn{3}{c|}{\VAR{yzteam}} \\ \cline{3-5} + \multicolumn{2}{|c|}{} & \VAR{x} = 0 & \VAR{x} = 1 & \VAR{x} = 2 \\ \hline +\multirow{6}{*}{\VAR{xteam}} & \VAR{yz} = 0 & 0 & 1 & 2 \\ \cline{2-5} + & \VAR{yz} = 1 & 3 & 4 & 5 \\ \cline{2-5} + & \VAR{yz} = 2 & 6 & 7 & 8 \\ \cline{2-5} + & \VAR{yz} = 3 & 9 & 10 & 11 \\ \cline{2-5} + & \VAR{yz} = 4 & 12 & 13 & 14 \\ \cline{2-5} + & \VAR{yz} = 5 & 15 \\ + \cline{0-2} + \end{tabular} + \end{center} + + The second split of \VAR{yzteam} for \VAR{x} = 0, \VAR{ydim} = 4 results in 2 + \VAR{yteams} and 4 \VAR{zteams}: + + + \begin{center} + \begin{tabular}{|l|l|l|l|l|l|} + \hline + \multicolumn{2}{|c|}{} & \multicolumn{4}{c|}{\VAR{zteam}} \\ \cline{3-6} + \multicolumn{2}{|c|}{} & \VAR{y} = 0 & \VAR{y} = 1 & \VAR{y} = 2 & \VAR{y} = 3 \\ \hline +\multirow{2}{*}{\VAR{yteam}} & \VAR{z} = 0 & 0 & 3 & 6 & 9 \\ \cline{2-6} + & \VAR{z} = 1 & 12 & 15 \\ + \cline{0-3} + \end{tabular} + \end{center} + + The second split of \VAR{yzteam} for \VAR{x} = 1, \VAR{ydim} = 4 results in + 2 \VAR{yteams} and 4 \VAR{zteams}: + + \begin{center} + \begin{tabular}{|l|l|l|l|l|l|} + \hline + \multicolumn{2}{|c|}{} & \multicolumn{4}{c|}{\VAR{zteam}} \\ \cline{3-6} + \multicolumn{2}{|c|}{} & \VAR{y} = 0 & \VAR{y} = 1 & \VAR{y} = 2 & \VAR{y} = 3 \\ \hline +\multirow{2}{*}{\VAR{yteam}} & \VAR{z} = 0 & 1 & 4 & 7 & 10 \\ \cline{2-6} + & \VAR{z} = 1 & 13 \\ + \cline{0-2} + \end{tabular} + \end{center} + + The second split of \VAR{yzteam} for \VAR{x} = 2, \VAR{ydim} = 4 results in + 2 \VAR{yteams} and 4 \VAR{zteams}: + + \begin{center} + \begin{tabular}{|l|l|l|l|l|l|} + \hline + \multicolumn{2}{|c|}{} & \multicolumn{4}{c|}{\VAR{zteam}} \\ \cline{3-6} + \multicolumn{2}{|c|}{} & \VAR{y} = 0 & \VAR{y} = 1 & \VAR{y} = 2 & \VAR{y} = 3 \\ \hline 
+\multirow{2}{*}{\VAR{yteam}} & \VAR{z} = 0 & 2 & 5 & 8 & 11 \\ \cline{2-6} + & \VAR{z} = 1 & 14 \\ + \cline{0-2} + \end{tabular} + \end{center} + + The final number of teams for each dimension are: + \begin{itemize} + \item 6 \VAR{xteams}: these are teams where (\VAR{z},\VAR{y}) is fixed and \VAR{x} varies. + \item 6 \VAR{yteams}: these are teams where (\VAR{x},\VAR{z}) is fixed and \VAR{y} varies. + \item 12 \VAR{zteams}: these are teams where (\VAR{x},\VAR{y}) is fixed and \VAR{z} varies. + \end{itemize} + + The expected output is: \\ + \begin{small} + \texttt{ + (0, 0, 0) is me = 0 \\ + (1, 0, 0) is me = 1 \\ + (2, 0, 0) is me = 2 \\ + (0, 1, 0) is me = 3 \\ + (1, 1, 0) is me = 4 \\ + (2, 1, 0) is me = 5 \\ + (0, 2, 0) is me = 6 \\ + (1, 2, 0) is me = 7 \\ + (2, 2, 0) is me = 8 \\ + (0, 3, 0) is me = 9 \\ + (1, 3, 0) is me = 10 \\ + (2, 3, 0) is me = 11 \\ + (0, 0, 1) is me = 12 \\ + (1, 0, 1) is me = 13 \\ + (2, 0, 1) is me = 14 \\ + (0, 1, 1) is me = 15 + } + \end{small} +} + +\end{apiexamples} + +\end{apidefinition} diff --git a/content/shmem_team_split_strided.tex b/content/shmem_team_split_strided.tex new file mode 100644 index 000000000..6355dc950 --- /dev/null +++ b/content/shmem_team_split_strided.tex @@ -0,0 +1,117 @@ +\apisummary{ +Create a new \openshmem team from a subset of the existing parent team \acp{PE}, +where the subset is defined by the +\ac{PE} triplet (\VAR{start}, \VAR{stride}, and \VAR{size}) supplied to the routine.} + +\begin{apidefinition} + +\begin{Csynopsis} +int @\FuncDecl{shmem\_team\_split\_strided}@(shmem_team_t parent_team, int start, int stride, int size, + const shmem_team_config_t *config, long config_mask, shmem_team_t *new_team); +\end{Csynopsis} + +\begin{apiarguments} +\apiargument{IN}{parent\_team}{An \openshmem team.} + +\apiargument{IN}{start}{The lowest \ac{PE} number of the subset of \acp{PE} from +the parent team that will form the new team.} + +\apiargument{IN}{stride}{The stride between team \ac{PE} +numbers in 
the parent team that comprise the subset of \acp{PE} that will form +the new team.} + +\apiargument{IN}{size}{The number of \acp{PE} from the parent team in the subset +of \acp{PE} that will form the new team.} + +\apiargument{IN}{config}{ + A pointer to the configuration parameters for the new team.} + +\apiargument{IN}{config\_mask}{ + The bitwise mask representing the set of configuration parameters to use + from \VAR{config}.} + +\apiargument{OUT}{new\_team}{A new \openshmem team handle, representing a \ac{PE} +subset of all the \acp{PE} in the parent team that is created from +the \ac{PE} triplet provided.} + +\end{apiarguments} + +\apidescription{ +The \FUNC{shmem\_team\_split\_strided} routine is a collective routine. +It creates a new \openshmem team from a subset of the existing parent team, +where the \ac{PE} subset is defined by the triplet of arguments +(\VAR{start}, \VAR{stride}, \VAR{size}). +A valid triplet is one such that: +\begin{equation*} + start + stride \cdot i \in \mathbb{Z}_N + \hspace{0.35em} + \forall + \hspace{0.35em} + i \in \mathbb{Z}_{size} +\end{equation*} +where $N$ is the number of \acp{PE} in the parent team. + +This routine must be called by all \acp{PE} in the parent team. +All \acp{PE} must provide the same values for the \ac{PE} triplet. +This routine will return a \VAR{new\_team} containing the \ac{PE} +subset specified by the triplet and ordered by the existing global +\ac{PE} number. + +On successful creation of the new team, the \VAR{new\_team} handle +will reference a valid team for the subset of \acp{PE} in the parent +team specified by the triplet. +Those \acp{PE} in the parent team that are not in the subset specified +by the triplet will have \VAR{new\_team} assigned to +\LibConstRef{SHMEM\_TEAM\_INVALID}. + +The \VAR{config} argument specifies team configuration parameters, which are +described in Section~\ref{subsec:shmem_team_config_t}. 
+ +The \VAR{config\_mask} argument is a bitwise mask representing the set of +configuration parameters to use from \VAR{config}. +A \VAR{config\_mask} value of \CONST{0} indicates that the team +should be created with the default values for all configuration parameters. +See Section~\ref{subsec:shmem_team_config_t} for field mask names and +default configuration parameters. + +If \VAR{parent\_team} compares equal to \LibConstRef{SHMEM\_TEAM\_INVALID}, +then no new team will be created and \VAR{new\_team} will be assigned the +value \LibConstRef{SHMEM\_TEAM\_INVALID}. If \VAR{parent\_team} is otherwise +invalid, the behavior is undefined. + +If an invalid \ac{PE} triplet is provided, then the \VAR{new\_team} +will not be created. + +If \VAR{new\_team} cannot be created, then it will be assigned the value +\LibConstRef{SHMEM\_TEAM\_INVALID}. +} + +\apireturnvalues{ + Zero on successful creation of \VAR{new\_team}, nonzero otherwise. +} + +\apinotes{ + It is important to note the use of the less restrictive + \VAR{stride} argument instead of \VAR{logPE\_stride}. This method of + creating a team with an arbitrary set of \acp{PE} is inherently restricted + by its parameters, but allows for many additional use-cases over using a + \VAR{logPE\_stride} parameter, and may provide an easier transition for + existing \openshmem programs to create and use \openshmem teams. + + See the description of team handles and predefined teams at the top of + Section~\ref{subsec:team} for more information about semantics and usage. +} + +\begin{apiexamples} + + \apicexample + {The following example demonstrates the use of strided split in a + \Cstd[11] program.
The program creates a new team of all even number + \acp{PE} from the default team, then retrieves the \ac{PE} number and + team size on all \acp{PE} that are members of the new team.} + {./example_code/shmem_team_split_strided.c} + {} + +\end{apiexamples} + +\end{apidefinition} diff --git a/content/shmem_team_translate_pe.tex b/content/shmem_team_translate_pe.tex new file mode 100644 index 000000000..d3e8c772a --- /dev/null +++ b/content/shmem_team_translate_pe.tex @@ -0,0 +1,55 @@ +\apisummary{ + Translate a given \ac{PE} number from one team to the corresponding + \ac{PE} number in another team. +} + +\begin{apidefinition} + +\begin{Csynopsis} +int @\FuncDecl{shmem\_team\_translate\_pe}@(shmem_team_t src_team, int src_pe, + shmem_team_t dest_team); +\end{Csynopsis} + +\begin{apiarguments} +\apiargument{IN}{src\_team}{A valid \openshmem team handle.} +\apiargument{IN}{src\_pe}{A \ac{PE} number in \VAR{src\_team}.} +\apiargument{IN}{dest\_team}{A valid \openshmem team handle.} +\end{apiarguments} + +\apidescription{ +The \FUNC{shmem\_team\_translate\_pe} routine will translate a given \ac{PE} number +to the corresponding \ac{PE} number in another team. +Specifically, given the \VAR{src\_pe} in \VAR{src\_team}, this routine returns that +\ac{PE}'s number in \VAR{dest\_team}. If \VAR{src\_pe} is not a member of both the +\VAR{src\_team} and \VAR{dest\_team}, a value of \CONST{-1} is returned. + + +If either of the \VAR{src\_team} or \VAR{dest\_team} handle is invalid, the behavior is undefined. +} + +\apireturnvalues{ +The specified \ac{PE}'s number in the \VAR{dest\_team}, or a value of \CONST{-1} if any +team handle arguments are invalid or the \VAR{src\_pe} is not in both the source and destination teams. +} + +\apinotes{ + If \LibHandleRef{SHMEM\_TEAM\_WORLD} is provided as the + \VAR{dest\_team} parameter, this routine acts as a global \ac{PE} + number translator and will return the corresponding + \LibHandleRef{SHMEM\_TEAM\_WORLD} number.
+} + +\begin{apiexamples} + + \apicexample + {The following example demonstrates the use of the team \ac{PE} + number translation routine. The program makes a new team of all + of the even number \acp{PE} in the default team. Then, all \acp{PE} + in the new team acquire their \ac{PE} number in the new team + and translate it to the \ac{PE} number in the default team.} + {./example_code/shmem_team_translate_pe.c} + {} + +\end{apiexamples} + +\end{apidefinition} diff --git a/content/shmem_test.tex b/content/shmem_test.tex index ac739fea2..62935ebe0 100644 --- a/content/shmem_test.tex +++ b/content/shmem_test.tex @@ -29,7 +29,12 @@ \apidescription{ \FUNC{shmem\_test} tests the numeric comparison of the symmetric object pointed to by \VAR{ivar} with the value \VAR{cmp\_value} according to the - comparison operator \VAR{cmp}. + comparison operator \VAR{cmp}. The \VAR{ivar} object at the + calling \ac{PE} may be updated by an \ac{AMO} performed by a thread located + within the calling \ac{PE} or within another \ac{PE}. + + Implementations must ensure that \FUNC{shmem\_test} does not return 1 before + the update of the memory indicated by \VAR{ivar} is fully complete. } \apireturnvalues{ diff --git a/content/shmem_test_all.tex b/content/shmem_test_all.tex index 601cf4f3b..ad3bd3dc3 100644 --- a/content/shmem_test_all.tex +++ b/content/shmem_test_all.tex @@ -34,11 +34,15 @@ \apidescription{ The \FUNC{shmem\_test\_all} routine indicates whether all entries in the test set specified by \VAR{ivars} and \VAR{status} have satisfied the test - condition at the calling \ac{PE}. This routine does not block and returns zero if - not all entries in \VAR{ivars} satisfied the test condition. This routine - compares each of the \VAR{nelems} elements in the \VAR{ivars} array with - the value \VAR{cmp\_value} according to the comparison operator \VAR{cmp} - at the calling \ac{PE}. + condition at the calling \ac{PE}. 
The \VAR{ivars} objects at the calling + \ac{PE} may be updated by an \ac{AMO} performed by a thread located within + the calling \ac{PE} or within another \ac{PE}. + This routine does not block and returns zero if + not all entries in \VAR{ivars} satisfied the test condition. + This routine compares each element of the \VAR{ivars} array in the + test set with the value \VAR{cmp\_value} according to the comparison + operator \VAR{cmp} at the calling \ac{PE}. + If \VAR{nelems} is 0, the test set is empty and this routine returns 1. The optional \VAR{status} is a mask array of length \VAR{nelems} where each element @@ -50,6 +54,9 @@ ignored and all elements in \VAR{ivars} are included in the test set. The \VAR{ivars}, \VAR{indices}, and \VAR{status} arrays must not overlap in memory. + + Implementations must ensure that \FUNC{shmem\_test\_all} does not return 1 + before the update of the memory indicated by \VAR{ivars} is fully complete. } \apireturnvalues{ diff --git a/content/shmem_test_all_vector.tex b/content/shmem_test_all_vector.tex new file mode 100644 index 000000000..83ecd389e --- /dev/null +++ b/content/shmem_test_all_vector.tex @@ -0,0 +1,75 @@ +\apisummary{ + Indicate whether all variables within an array of variables on the local + \ac{PE} meet the specified test conditions. +} + +\begin{apidefinition} + +\begin{C11synopsis} +int @\FuncDecl{shmem\_test\_all\_vector}@(TYPE *ivars, size_t nelems, const int *status, int cmp, + TYPE *cmp_values); +\end{C11synopsis} +where \TYPE{} is one of the point-to-point synchronization types specified by +Table \ref{p2psynctypes}. + +\begin{Csynopsis} +int @\FuncDecl{shmem\_\FuncParam{TYPENAME}\_test\_all\_vector}@(TYPE *ivars, size_t nelems, const int *status, int cmp, + TYPE *cmp_values); +\end{Csynopsis} +where \TYPE{} is one of the point-to-point synchronization types and has a +corresponding \TYPENAME{} specified by Table \ref{p2psynctypes}. 
+ +\begin{apiarguments} + + \apiargument{IN}{ivars}{A pointer to an array of remotely accessible data + objects.} + \apiargument{IN}{nelems}{The number of elements in the \VAR{ivars} array.} + \apiargument{IN}{status}{An optional mask array of length \VAR{nelems} + that indicates which elements in \VAR{ivars} are excluded from the test set.} + \apiargument{IN}{cmp}{A comparison operator from Table~\ref{p2p-consts} that + compares elements of \VAR{ivars} with elements of \VAR{cmp\_values}.} + \apiargument{IN}{cmp\_values}{An array of length \VAR{nelems} + containing values to be compared with the respective objects in \VAR{ivars}.} + +\end{apiarguments} + +\apidescription{ + The \FUNC{shmem\_test\_all\_vector} routine indicates whether all + entries in the test set specified by \VAR{ivars} and \VAR{status} have + satisfied the test condition at the calling \ac{PE}. The \VAR{ivars} + objects at the calling \ac{PE} may be updated by an \ac{AMO} performed by a + thread located within the calling \ac{PE} or within another \ac{PE}. + This routine does not + block and returns zero if not all entries in \VAR{ivars} satisfied the test + conditions. This routine compares each element of the + \VAR{ivars} array in the test set with each respective value in + \VAR{cmp\_values} according to the comparison operator \VAR{cmp} at the + calling \ac{PE}. If \VAR{nelems} is 0, the test set is empty and this + routine returns 1. + + The optional \VAR{status} is a mask array of length \VAR{nelems} where each element + corresponds to the respective element in \VAR{ivars} and indicates whether + the element is excluded from the test set. Elements of \VAR{status} set to + 0 will be included in the test set, and elements set to 1 will be ignored. If all elements + in \VAR{status} are set to 1 or \VAR{nelems} is 0, the test set is empty + and this routine returns 1. If \VAR{status} is a null pointer, it is + ignored and all elements in \VAR{ivars} are included in the test set.
The + \VAR{ivars} and \VAR{status} arrays must not overlap in + memory. + + Implementations must ensure that \FUNC{shmem\_test\_all\_vector} does not + return 1 before the update of the memory indicated by \VAR{ivars} is fully + complete. +} + +\apireturnvalues{ + \FUNC{shmem\_test\_all\_vector} returns 1 if all variables in \VAR{ivars} + satisfy the test conditions or if \VAR{nelems} is 0, otherwise this routine + returns 0. +} + +\apinotes{ + None. +} + +\end{apidefinition} diff --git a/content/shmem_test_any.tex b/content/shmem_test_any.tex index bd2462242..eee615887 100644 --- a/content/shmem_test_any.tex +++ b/content/shmem_test_any.tex @@ -35,11 +35,15 @@ \apidescription{ The \FUNC{shmem\_test\_any} routine indicates whether any entry in the test set specified by \VAR{ivars} and \VAR{status} has satisfied the test - condition at the calling \ac{PE}. This routine does not block and returns \CONST{SIZE\_MAX} if - no entries in \VAR{ivars} satisfied the test condition. This routine - compares each of the \VAR{nelems} elements in the \VAR{ivars} array with - the value \VAR{cmp\_value} according to the comparison operator \VAR{cmp} - at the calling \ac{PE}. The order in which these elements are tested is + condition at the calling \ac{PE}. The \VAR{ivars} objects at the calling + \ac{PE} may be updated by an \ac{AMO} performed by a thread located within + the calling \ac{PE} or within another \ac{PE}. + This routine does not block and returns \CONST{SIZE\_MAX} if + no entries in \VAR{ivars} satisfied the test condition. + This routine compares each element of the \VAR{ivars} array in the + test set with the value \VAR{cmp\_value} according to the comparison + operator \VAR{cmp} at the calling \ac{PE}. + The order in which these elements are tested is unspecified. If an entry $i$ in \VAR{ivars} within the test set satisfies the test condition, a series of calls to \FUNC{shmem\_test\_any} must eventually return $i$.
@@ -53,6 +57,10 @@ null pointer, it is ignored and all elements in \VAR{ivars} are included in the test set. The \VAR{ivars} and \VAR{status} arrays must not overlap in memory. + + Implementations must ensure that \FUNC{shmem\_test\_any} does not return an + index before the update of the memory indicated by the corresponding + \VAR{ivars} element is fully complete. } \apireturnvalues{ diff --git a/content/shmem_test_any_vector.tex b/content/shmem_test_any_vector.tex new file mode 100644 index 000000000..f27ea1caa --- /dev/null +++ b/content/shmem_test_any_vector.tex @@ -0,0 +1,78 @@ +\apisummary{ + Indicate whether any one variable within an array of variables on the local + \ac{PE} meets its specified test condition. +} + +\begin{apidefinition} + +\begin{C11synopsis} +size_t @\FuncDecl{shmem\_test\_any\_vector}@(TYPE *ivars, size_t nelems, const int *status, int cmp, + TYPE *cmp_values); +\end{C11synopsis} +where \TYPE{} is one of the point-to-point synchronization types specified by +Table \ref{p2psynctypes}. + +\begin{Csynopsis} +size_t @\FuncDecl{shmem\_\FuncParam{TYPENAME}\_test\_any\_vector}@(TYPE *ivars, size_t nelems, const int *status, + int cmp, TYPE *cmp_values); +\end{Csynopsis} +where \TYPE{} is one of the point-to-point synchronization types and has a +corresponding \TYPENAME{} specified by Table \ref{p2psynctypes}. 
+ +\begin{apiarguments} + + \apiargument{IN}{ivars}{A pointer to an array of remotely accessible data + objects.} + \apiargument{IN}{nelems}{The number of elements in the \VAR{ivars} array.} + \apiargument{IN}{status}{An optional mask array of length \VAR{nelems} + that indicates which elements in \VAR{ivars} are excluded from the test set.} + \apiargument{IN}{cmp}{A comparison operator from Table~\ref{p2p-consts} that + compares elements of \VAR{ivars} with elements of \VAR{cmp\_values}.} + \apiargument{IN}{cmp\_values}{An array of length \VAR{nelems} + containing values to be compared with the respective objects in \VAR{ivars}.} + +\end{apiarguments} + +\apidescription{ + The \FUNC{shmem\_test\_any\_vector} routine indicates whether any + entry in the test set specified by \VAR{ivars} and \VAR{status} has + satisfied the test condition at the calling \ac{PE}. The \VAR{ivars} + objects at the calling \ac{PE} may be updated by an \ac{AMO} performed by a + thread located within the calling \ac{PE} or within another \ac{PE}. + This routine does not + block and returns \CONST{SIZE\_MAX} if no entries in \VAR{ivars} satisfied + the test condition. This routine compares each element of the + \VAR{ivars} array in the test set with each respective value in + \VAR{cmp\_values} according to the comparison operator \VAR{cmp} at the + calling \ac{PE}. The order in which these elements are tested is + unspecified. If an entry $i$ in \VAR{ivars} within the test set satisfies + the test condition, a series of calls to + \FUNC{shmem\_test\_any\_vector} must eventually return $i$. + + The optional \VAR{status} is a mask array of length \VAR{nelems} where each + element corresponds to the respective element in \VAR{ivars} and indicates + whether the element is excluded from the test set. Elements of + \VAR{status} set to 0 will be included in the test set, and elements set to + 1 will be ignored. 
If all elements in \VAR{status} are set to 1 or + \VAR{nelems} is 0, the test set is empty and this routine returns + \CONST{SIZE\_MAX}. If \VAR{status} is a null pointer, it is ignored and + all elements in \VAR{ivars} are included in the test set. The \VAR{ivars} + and \VAR{status} arrays must not overlap in memory. + + Implementations must ensure that \FUNC{shmem\_test\_any\_vector} does not + return an index before the update of the memory indicated by the + corresponding \VAR{ivars} element is fully complete. +} + +\apireturnvalues{ + \FUNC{shmem\_test\_any\_vector} returns the index of an element in the \VAR{ivars} + array that satisfies the test condition. If the test set is empty or no + conditions in the test set are satisfied, this routine returns \CONST{SIZE\_MAX}. +} + +\apinotes{ + None. +} + + +\end{apidefinition} diff --git a/content/shmem_test_some.tex b/content/shmem_test_some.tex index e619e293a..750ecc574 100644 --- a/content/shmem_test_some.tex +++ b/content/shmem_test_some.tex @@ -37,11 +37,15 @@ \apidescription{ The \FUNC{shmem\_test\_some} routine indicates whether at least one entry in the test set specified by \VAR{ivars} and \VAR{status} satisfies the - test condition at the calling \ac{PE}. This routine does not block and returns zero if - no entries in \VAR{ivars} satisfied the test condition. This routine - compares each element of the \VAR{ivars} array in the test set with the - value \VAR{cmp\_value} according to the comparison operator \VAR{cmp} at - the calling \ac{PE}. This routine tests all elements of \VAR{ivars} in the + test condition at the calling \ac{PE}. The \VAR{ivars} objects at the + calling \ac{PE} may be updated by an \ac{AMO} performed by a thread located + within the calling \ac{PE} or within another \ac{PE}. + This routine does not block and returns zero if + no entries in \VAR{ivars} satisfied the test condition. 
+ This routine compares each element of the \VAR{ivars} array in the + test set with the value \VAR{cmp\_value} according to the comparison + operator \VAR{cmp} at the calling \ac{PE}. + This routine tests all elements of \VAR{ivars} in the test set at least once, and the order in which the elements are tested is unspecified. If an entry $i$ in \VAR{ivars} within the test set satisfies the test condition, a series of calls to \FUNC{shmem\_test\_some} must @@ -69,6 +73,10 @@ empty and this routine returns 0. If \VAR{status} is a null pointer, it is ignored and all elements in \VAR{ivars} are included in the test set. The \VAR{ivars}, \VAR{indices}, and \VAR{status} arrays must not overlap in memory. + + Implementations must ensure that \FUNC{shmem\_test\_some} does not return + indices before the updates of the memory indicated by the corresponding + \VAR{ivars} elements are fully complete. } \apireturnvalues{ diff --git a/content/shmem_test_some_vector.tex b/content/shmem_test_some_vector.tex new file mode 100644 index 000000000..fdf5c26bb --- /dev/null +++ b/content/shmem_test_some_vector.tex @@ -0,0 +1,89 @@ +\apisummary{ + Indicate whether at least one variable within an array of variables on the + local \ac{PE} meets its specified test condition. +} + +\begin{apidefinition} + +\begin{C11synopsis} +size_t @\FuncDecl{shmem\_test\_some\_vector}@(TYPE *ivars, size_t nelems, size_t *indices, const int *status, + int cmp, TYPE *cmp_values); +\end{C11synopsis} +where \TYPE{} is one of the point-to-point synchronization types specified by +Table \ref{p2psynctypes}. + +\begin{Csynopsis} +size_t @\FuncDecl{shmem\_\FuncParam{TYPENAME}\_test\_some\_vector}@(TYPE *ivars, size_t nelems, size_t *indices, + const int *status, int cmp, TYPE *cmp_values); +\end{Csynopsis} +where \TYPE{} is one of the point-to-point synchronization types and has a +corresponding \TYPENAME{} specified by Table \ref{p2psynctypes}. 
+ +\begin{apiarguments} + + \apiargument{IN}{ivars}{A pointer to an array of remotely accessible data + objects.} + \apiargument{IN}{nelems}{The number of elements in the \VAR{ivars} array.} + \apiargument{OUT}{indices}{An array of indices of length at least + \VAR{nelems} into \VAR{ivars} that satisfied the test condition.} + \apiargument{IN}{status}{An optional mask array of length \VAR{nelems} + that indicates which elements in \VAR{ivars} are excluded from the test set.} + \apiargument{IN}{cmp}{A comparison operator from Table~\ref{p2p-consts} that + compares elements of \VAR{ivars} with elements of \VAR{cmp\_values}.} + \apiargument{IN}{cmp\_values}{An array of length \VAR{nelems} + containing values to be compared with the respective objects in \VAR{ivars}.} + +\end{apiarguments} + +\apidescription{ + The \FUNC{shmem\_test\_some\_vector} routine indicates whether at + least one entry in the test set specified by \VAR{ivars} and \VAR{status} + satisfies the test condition at the calling \ac{PE}. The \VAR{ivars} + objects at the calling \ac{PE} may be updated by an \ac{AMO} performed by a + thread located within the calling \ac{PE} or within another \ac{PE}. + This routine does not + block and returns zero if no entries in \VAR{ivars} satisfied the test + condition. This routine compares each element of the \VAR{ivars} + array in the test set with each respective value in \VAR{cmp\_values} + according to the comparison operator \VAR{cmp} at the calling \ac{PE}. + This routine tests all elements of \VAR{ivars} in the test set at least + once, and the order in which the elements are tested is unspecified. + + Upon return, the \VAR{indices} array contains the indices of the elements + in the test set that satisfied the test condition during the call to + \FUNC{shmem\_test\_some\_vector}. The return value of + \FUNC{shmem\_test\_some\_vector} is equal to the total number of + these satisfied elements. 
If the return value is $N$, then the first $N$ + elements of the \VAR{indices} array contain those unique indices that + satisfied the test condition. These first $N$ elements of \VAR{indices} + may be unordered with respect to the corresponding indices of \VAR{ivars}. + The array pointed to by \VAR{indices} must be at least \VAR{nelems} long. + If an entry $i$ in \VAR{ivars} within the test set satisfies the test + condition, a series of calls to \FUNC{shmem\_test\_some\_vector} + must eventually include $i$ in the \VAR{indices} array. + + The optional \VAR{status} is a mask array of length \VAR{nelems} where each element + corresponds to the respective element in \VAR{ivars} and indicates whether + the element is excluded from the test set. Elements of \VAR{status} set to + 0 will be included in the test set, and elements set to 1 will be ignored. If all + elements in \VAR{status} are set to 1 or \VAR{nelems} is 0, the test set is + empty and this routine returns 0. If \VAR{status} is a null pointer, it is ignored and all + elements in \VAR{ivars} are included in the test set. The \VAR{ivars}, + \VAR{indices}, and \VAR{status} arrays must not overlap in memory. + + Implementations must ensure that \FUNC{shmem\_test\_some\_vector} does not + return indices before the updates of the memory indicated by the + corresponding \VAR{ivars} elements are fully complete. +} + +\apireturnvalues{ + \FUNC{shmem\_test\_some\_vector} returns the number of indices returned in + the \VAR{indices} array. If the test set is empty, this routine returns 0. +} + +\apinotes{ + None. 
+} + + +\end{apidefinition} diff --git a/content/shmem_wait_until.tex b/content/shmem_wait_until.tex index c937f85d6..89595ad00 100644 --- a/content/shmem_wait_until.tex +++ b/content/shmem_wait_until.tex @@ -43,13 +43,9 @@ \apidescription{ The \FUNC{shmem\_wait} and \FUNC{shmem\_wait\_until} operations block until the value contained in the symmetric data object, \VAR{ivar}, at the - calling \ac{PE} satisfies the wait condition. In an \openshmem program - with single-threaded \acp{PE}, the \VAR{ivar} object at the calling \ac{PE} - may be updated by an \ac{RMA}, \ac{AMO}, or store operation performed by another - \ac{PE}. In an \openshmem program with multithreaded \acp{PE}, the - \VAR{ivar} object at the calling \ac{PE} may be updated by an \ac{RMA}, \ac{AMO}, or - store operation performed by a thread located within the calling \ac{PE} or - within another \ac{PE}. + calling \ac{PE} satisfies the wait condition. The \VAR{ivar} object at the + calling \ac{PE} may be updated by an \ac{AMO} performed by a thread located + within the calling \ac{PE} or within another \ac{PE}. These routines can be used to implement point-to-point synchronization between \acp{PE} or between threads within the same \ac{PE}. A call to @@ -58,10 +54,14 @@ to \FUNC{shmem\_wait\_until} blocks until the value of \VAR{ivar} at the calling \ac{PE} satisfies the wait condition specified by the comparison operator, \VAR{cmp}, and comparison value, \VAR{cmp\_value}. + + Implementations must ensure that \FUNC{shmem\_wait} and + \FUNC{shmem\_wait\_until} do not return before the update of the memory + indicated by \VAR{ivar} is fully complete. } \apireturnvalues{ - None. + None } \apinotes{ @@ -71,10 +71,13 @@ } \apiimpnotes{ - Implementations must ensure that \FUNC{shmem\_wait} and - \FUNC{shmem\_wait\_until} do not return before the update of the memory - indicated by \VAR{ivar} is fully complete. 
Partial updates to the memory - must not cause \FUNC{shmem\_wait} or \FUNC{shmem\_wait\_until} to return. + Some platforms may allow wait operations to efficiently poll or block on an + update to \VAR{ivar}. On others, an atomic read operation may be needed to + observe updates to \VAR{ivar}. On platforms where atomic read operations + incur high overhead, implementations may be able to reduce the number of + atomic reads performed by using non-atomic reads of \VAR{ivar} to wait for a + change to occur, followed by an atomic read operation to fetch the updated + value. } \end{apidefinition} diff --git a/content/shmem_wait_until_all.tex b/content/shmem_wait_until_all.tex index 8b947e099..35a92ac91 100644 --- a/content/shmem_wait_until_all.tex +++ b/content/shmem_wait_until_all.tex @@ -34,7 +34,13 @@ \apidescription{ The \FUNC{shmem\_wait\_until\_all} routine waits until all entries in the wait set specified by \VAR{ivars} and \VAR{status} have satisfied the wait condition at the - calling \ac{PE}. If \VAR{nelems} is 0, the wait set is empty and this routine returns immediately. + calling \ac{PE}. The \VAR{ivars} objects at the calling \ac{PE} may be + updated by an \ac{AMO} performed by a thread located within the calling + \ac{PE} or within another \ac{PE}. + If \VAR{nelems} is 0, the wait set is empty and this routine returns immediately. + This routine compares each element of the \VAR{ivars} array in the + wait set with the value \VAR{cmp\_value} according to the comparison + operator \VAR{cmp} at the calling \ac{PE}. This routine is semantically similar to \FUNC{shmem\_wait\_until} in Section~\ref{subsec:shmem_wait_until}, but adds support for point-to-point synchronization involving an array of @@ -49,6 +55,10 @@ immediately. If \VAR{status} is a null pointer, it is ignored and all elements in \VAR{ivars} are included in the wait set. The \VAR{ivars} and \VAR{status} arrays must not overlap in memory. 
+ + Implementations must ensure that \FUNC{shmem\_wait\_until\_all} does not + return before the update of the memory indicated by \VAR{ivars} is fully + complete. } @@ -60,13 +70,6 @@ None. } -\apiimpnotes{ - Implementations must ensure that \FUNC{shmem\_wait\_until\_all} does not - return before the update of the memory indicated by \VAR{ivars} is fully - complete. Partial updates to the memory must not cause - \FUNC{shmem\_wait\_until\_all} to return. -} - \begin{apiexamples} \apicexample diff --git a/content/shmem_wait_until_all_vector.tex b/content/shmem_wait_until_all_vector.tex new file mode 100644 index 000000000..2e250d2d7 --- /dev/null +++ b/content/shmem_wait_until_all_vector.tex @@ -0,0 +1,72 @@ +\apisummary{ + Wait on an array of variables on the local \ac{PE} until all variables meet + the specified wait conditions. +} + +\begin{apidefinition} + +\begin{C11synopsis} +void @\FuncDecl{shmem\_wait\_until\_all\_vector}@(TYPE *ivars, size_t nelems, const int *status, int cmp, + TYPE *cmp_values); +\end{C11synopsis} +where \TYPE{} is one of the point-to-point synchronization types specified by +Table \ref{p2psynctypes}. + +\begin{Csynopsis} +void @\FuncDecl{shmem\_\FuncParam{TYPENAME}\_wait\_until\_all\_vector}@(TYPE *ivars, size_t nelems, const int *status, int cmp, TYPE *cmp_values); +\end{Csynopsis} +where \TYPE{} is one of the point-to-point synchronization types and has a +corresponding \TYPENAME{} specified by Table~\ref{p2psynctypes}. 
+ +\begin{apiarguments} + + \apiargument{IN}{ivars}{A pointer to an array of remotely accessible data + objects.} + \apiargument{IN}{nelems}{The number of elements in the \VAR{ivars} array.} + \apiargument{IN}{status}{An optional mask array of length \VAR{nelems} + that indicates which elements in \VAR{ivars} are excluded from the wait set.} + \apiargument{IN}{cmp}{A comparison operator from Table~\ref{p2p-consts} that + compares elements of \VAR{ivars} with elements of \VAR{cmp\_values}.} + \apiargument{IN}{cmp\_values}{An array of length \VAR{nelems} + containing values to be compared with the respective objects in \VAR{ivars}.} + +\end{apiarguments} + +\apidescription{ + The \FUNC{shmem\_wait\_until\_all\_vector} routine waits until all entries + in the wait set specified by \VAR{ivars} and \VAR{status} have satisfied + the wait conditions at the calling \ac{PE}. The \VAR{ivars} + objects at the calling \ac{PE} may be updated by an \ac{AMO} performed by a + thread located within the calling \ac{PE} or within another \ac{PE}. + If \VAR{nelems} is + 0, the wait set is empty and this routine returns immediately. + This routine compares each element of the \VAR{ivars} array in the + wait set with each respective value in \VAR{cmp\_values} according to the + comparison operator \VAR{cmp} at the calling \ac{PE}. + + The optional \VAR{status} is a mask array of length \VAR{nelems} where each + element corresponds to the respective element in \VAR{ivars} and indicates + whether the element is excluded from the wait set. Elements of + \VAR{status} set to 0 will be included in the wait set, and elements set to + 1 will be ignored. If all elements in \VAR{status} are set to 1 or + \VAR{nelems} is 0, the wait set is empty and this routine returns + immediately. If \VAR{status} is a null pointer, it is ignored and all + elements in \VAR{ivars} are included in the wait set. The \VAR{ivars} and + \VAR{status} arrays must not overlap in memory. 
+ + Implementations must ensure that \FUNC{shmem\_wait\_until\_all\_vector} + does not return before the update of the memory indicated by \VAR{ivars} is + fully complete. +} + + +\apireturnvalues{ + None. +} + +\apinotes{ + None. +} + + +\end{apidefinition} diff --git a/content/shmem_wait_until_any.tex b/content/shmem_wait_until_any.tex index 349c1ae75..0872808c0 100644 --- a/content/shmem_wait_until_any.tex +++ b/content/shmem_wait_until_any.tex @@ -35,7 +35,13 @@ \apidescription{ The \FUNC{shmem\_wait\_until\_any} routine waits until any one entry in the wait set specified by \VAR{ivars} and \VAR{status} satisfies the wait - condition at the calling \ac{PE}. The order in which these elements are + condition at the calling \ac{PE}. The \VAR{ivars} objects at the calling + \ac{PE} may be updated by an \ac{AMO} performed by a thread located within + the calling \ac{PE} or within another \ac{PE}. + This routine compares each element of the \VAR{ivars} array in the + wait set with the value \VAR{cmp\_value} according to the comparison + operator \VAR{cmp} at the calling \ac{PE}. + The order in which these elements are waited upon is unspecified. If an entry $i$ in \VAR{ivars} within the wait set satisfies the wait condition, a series of calls to \FUNC{shmem\_wait\_until\_any} must eventually return $i$. @@ -50,6 +56,10 @@ \VAR{status} is a null pointer, it is ignored and all elements in \VAR{ivars} are included in the wait set. The \VAR{ivars} and \VAR{status} arrays must not overlap in memory. + + Implementations must ensure that \FUNC{shmem\_wait\_until\_any} does not + return before the update of the memory indicated by \VAR{ivars} is fully + complete. } \apireturnvalues{ @@ -62,13 +72,6 @@ None. } -\apiimpnotes{ - Implementations must ensure that \FUNC{shmem\_wait\_until\_any} does not - return before the update of the memory indicated by the completed index of \VAR{ivars} is fully - executed. 
Partial updates to the memory must not cause - \FUNC{shmem\_wait\_until\_any} to return. -} - \begin{apiexamples} \apicexample diff --git a/content/shmem_wait_until_any_vector.tex b/content/shmem_wait_until_any_vector.tex new file mode 100644 index 000000000..b7768f1b6 --- /dev/null +++ b/content/shmem_wait_until_any_vector.tex @@ -0,0 +1,85 @@ +\apisummary{ + Wait on an array of variables on the local \ac{PE} until any one variable + meets its specified wait condition. +} + +\begin{apidefinition} + +\begin{C11synopsis} +size_t @\FuncDecl{shmem\_wait\_until\_any\_vector}@(TYPE *ivars, size_t nelems, const int *status, int cmp, + TYPE *cmp_values); +\end{C11synopsis} +where \TYPE{} is one of the point-to-point synchronization types specified by +Table~\ref{p2psynctypes}. + +\begin{Csynopsis} +size_t @\FuncDecl{shmem\_\FuncParam{TYPENAME}\_wait\_until\_any\_vector}@(TYPE *ivars, size_t nelems, const int *status, + int cmp, TYPE *cmp_values); +\end{Csynopsis} +where \TYPE{} is one of the point-to-point synchronization types and has a +corresponding \TYPENAME{} specified by Table~\ref{p2psynctypes}. + +\begin{apiarguments} + + \apiargument{IN}{ivars}{A pointer to an array of remotely accessible data + objects.} + \apiargument{IN}{nelems}{The number of elements in the \VAR{ivars} array.} + \apiargument{IN}{status}{An optional mask array of length \VAR{nelems} + that indicates which elements in \VAR{ivars} are excluded from the wait set.} + \apiargument{IN}{cmp}{A comparison operator from Table~\ref{p2p-consts} that + compares elements of \VAR{ivars} with elements of \VAR{cmp\_values}.} + \apiargument{IN}{cmp\_values}{An array of length \VAR{nelems} + containing values to be compared with the respective objects in \VAR{ivars}.} + +\end{apiarguments} + +\apidescription{ + The \FUNC{shmem\_wait\_until\_any\_vector} routine waits until any one + entry in the wait set specified by \VAR{ivars} and \VAR{status} satisfies + the wait condition at the calling \ac{PE}.
The \VAR{ivars} objects at the + calling \ac{PE} may be updated by an \ac{AMO} performed by a thread located + within the calling \ac{PE} or within another \ac{PE}. + This routine compares + each element of the \VAR{ivars} array in the wait set with each respective + value in \VAR{cmp\_values} according to the comparison operator \VAR{cmp} + at the calling \ac{PE}. The order in which these elements are waited upon + is unspecified. If an entry $i$ in \VAR{ivars} within the wait set + satisfies the wait condition, a series of calls to + \FUNC{shmem\_wait\_until\_any\_vector} must eventually return $i$. + + The optional \VAR{status} is a mask array of length \VAR{nelems} where each + element corresponds to the respective element in \VAR{ivars} and indicates + whether the element is excluded from the wait set. Elements of + \VAR{status} set to 0 will be included in the wait set, and elements set to + 1 will be ignored. If all elements in \VAR{status} are set to 1 or + \VAR{nelems} is 0, the wait set is empty and this routine returns + \CONST{SIZE\_MAX}. If \VAR{status} is a null pointer, it is ignored and + all elements in \VAR{ivars} are included in the wait set. The \VAR{ivars} + and \VAR{status} arrays must not overlap in memory. + + Implementations must ensure that \FUNC{shmem\_wait\_until\_any\_vector} + does not return before the update of the memory indicated by \VAR{ivars} is + fully complete. +} + +\apireturnvalues{ + \FUNC{shmem\_wait\_until\_any\_vector} returns the index of an element in the + \VAR{ivars} array that satisfies the wait condition. If the wait set is + empty, this routine returns \CONST{SIZE\_MAX}. +} + +\apinotes{ + None. 
+} + +\begin{apiexamples} + \apicexample + {The following \Cstd[11] example demonstrates the use of + \FUNC{shmem\_wait\_until\_any\_vector} to wait on values that differ + between even PEs and odd PEs.} + {./example_code/shmem_wait_until_any_vector.c} + {} +\end{apiexamples} + +\end{apidefinition} + diff --git a/content/shmem_wait_until_some.tex b/content/shmem_wait_until_some.tex index 2729f09ba..d21709649 100644 --- a/content/shmem_wait_until_some.tex +++ b/content/shmem_wait_until_some.tex @@ -37,7 +37,13 @@ \apidescription{ The \FUNC{shmem\_wait\_until\_some} routine waits until at least one entry in the wait set specified by \VAR{ivars} and \VAR{status} satisfies the - wait condition at the calling \ac{PE}. This routine tests all elements of + wait condition at the calling \ac{PE}. The \VAR{ivars} objects at the + calling \ac{PE} may be updated by an \ac{AMO} performed by a thread located + within the calling \ac{PE} or within another \ac{PE}. + This routine compares each element of the \VAR{ivars} array in the + wait set with the value \VAR{cmp\_value} according to the comparison + operator \VAR{cmp} at the calling \ac{PE}. + This routine tests all elements of \VAR{ivars} in the wait set at least once, and the order in which the elements are waited upon is unspecified. @@ -66,6 +72,10 @@ and all elements in \VAR{ivars} are included in the wait set. The \VAR{ivars}, \VAR{indices}, and \VAR{status} arrays must not overlap in memory. + + Implementations must ensure that \FUNC{shmem\_wait\_until\_some} does not + return before the update of the memory indicated by \VAR{ivars} is fully + complete. } @@ -78,13 +88,6 @@ None. } -\apiimpnotes{ - Implementations must ensure that \FUNC{shmem\_wait\_until\_some} does not - return before the update of the memory indicated by the completed indices of \VAR{ivars} is fully - executed. Partial updates to the memory must not cause - \FUNC{shmem\_wait\_until\_some} to return. 
-} - \begin{apiexamples} \apicexample diff --git a/content/shmem_wait_until_some_vector.tex b/content/shmem_wait_until_some_vector.tex new file mode 100644 index 000000000..30dc169f3 --- /dev/null +++ b/content/shmem_wait_until_some_vector.tex @@ -0,0 +1,93 @@ +\apisummary{ + Wait on an array of variables on the local \ac{PE} until at least one + variable meets its specified wait condition. +} + +\begin{apidefinition} + +\begin{C11synopsis} +size_t @\FuncDecl{shmem\_wait\_until\_some\_vector}@(TYPE *ivars, size_t nelems, size_t *indices, + const int *status, int cmp, TYPE *cmp_values); +\end{C11synopsis} +where \TYPE{} is one of the point-to-point synchronization types specified by +Table~\ref{p2psynctypes}. + +\begin{Csynopsis} +size_t @\FuncDecl{shmem\_\FuncParam{TYPENAME}\_wait\_until\_some\_vector}@(TYPE *ivars, size_t nelems, size_t *indices, + const int *status, int cmp, TYPE *cmp_values); +\end{Csynopsis} +where \TYPE{} is one of the point-to-point synchronization types and has a +corresponding \TYPENAME{} specified by Table~\ref{p2psynctypes}.
+ +\begin{apiarguments} + + \apiargument{IN}{ivars}{A pointer to an array of remotely accessible data + objects.} + \apiargument{IN}{nelems}{The number of elements in the \VAR{ivars} array.} + \apiargument{OUT}{indices}{An array of indices of length at least + \VAR{nelems} into \VAR{ivars} that satisfied the wait condition.} + \apiargument{IN}{status}{An optional mask array of length \VAR{nelems} + that indicates which elements in \VAR{ivars} are excluded from the wait set.} + \apiargument{IN}{cmp}{A comparison operator from Table~\ref{p2p-consts} that + compares elements of \VAR{ivars} with elements of \VAR{cmp\_values}.} + \apiargument{IN}{cmp\_values}{An array of length \VAR{nelems} + containing values to be compared with the respective objects in \VAR{ivars}.} + +\end{apiarguments} + +\apidescription{ + The \FUNC{shmem\_wait\_until\_some\_vector} routine waits until + at least one entry in the wait set specified by \VAR{ivars} and + \VAR{status} satisfies the wait condition at the calling \ac{PE}. + The \VAR{ivars} objects at the calling \ac{PE} may be updated by an + \ac{AMO} performed by a thread located within the calling \ac{PE} or within + another \ac{PE}. + This routine compares each element of the \VAR{ivars} array in the + wait set with each respective value in \VAR{cmp\_values} according to the + comparison operator \VAR{cmp} at the calling \ac{PE}. This routine tests + all elements of \VAR{ivars} in the wait set at least once, and the order in + which the elements are waited upon is unspecified. + + Upon return, the \VAR{indices} array contains the indices of at least one + element in the wait set that satisfied the wait condition during the call + to \FUNC{shmem\_wait\_until\_some\_vector}. The return value of + \FUNC{shmem\_wait\_until\_some\_vector} is equal to the total + number of these satisfied elements. 
For a given return value $N$, the + first $N$ elements of the \VAR{indices} array contain those unique indices + that satisfied the wait condition. These first $N$ elements of + \VAR{indices} may be unordered with respect to the corresponding indices of + \VAR{ivars}. The array pointed to by \VAR{indices} must be at least + \VAR{nelems} long. If an entry $i$ in \VAR{ivars} within the wait set + satisfies the wait condition, a series of calls to + \FUNC{shmem\_wait\_until\_some\_vector} must eventually include + $i$ in the \VAR{indices} array. + + The optional \VAR{status} is a mask array of length \VAR{nelems} where each + element corresponds to the respective element in \VAR{ivars} and indicates + whether the element is excluded from the wait set. Elements of + \VAR{status} set to 0 will be included in the wait set, and elements set to + 1 will be ignored. If all elements in \VAR{status} are set to 1 or + \VAR{nelems} is 0, the wait set is empty and this routine returns 0. + If \VAR{status} is a null pointer, it is ignored + and all elements in \VAR{ivars} are included in the wait set. The + \VAR{ivars}, \VAR{indices}, and \VAR{status} arrays must not overlap in + memory. + + Implementations must ensure that \FUNC{shmem\_wait\_until\_some\_vector} + does not return before the update of the memory indicated by \VAR{ivars} is + fully complete. +} + + +\apireturnvalues{ + \FUNC{shmem\_wait\_until\_some\_vector} returns the number of + indices returned in the \VAR{indices} array. If the wait set is empty, this + routine returns 0. +} + +\apinotes{ + None. 
+} + + +\end{apidefinition} diff --git a/content/teams_intro.tex b/content/teams_intro.tex new file mode 100644 index 000000000..ddca69dcc --- /dev/null +++ b/content/teams_intro.tex @@ -0,0 +1,100 @@ +The \acp{PE} in an \openshmem program communicate using either +point-to-point routines---such as \ac{RMA} and \ac{AMO} routines---which specify the \ac{PE} number of the target +\ac{PE}, or collective routines, which operate over a set of \acp{PE}. +In \openshmem, teams allow programs to group a set of \acp{PE} for +communication. +Team-based collective communications operate across all the \acp{PE} +in a valid team. +Point-to-point communication can make use of team-relative \ac{PE} +numbering through team-based contexts (see Section~\ref{sec:ctx}) or +\ac{PE} number translation. + +\subsubsection*{Predefined and Program-Defined Teams} + +An \openshmem team may be predefined (i.e., provided by the \openshmem +library) or defined by the \openshmem program. +A program-defined team is created by ``splitting'' a parent team into +one or more new teams---each with some subset of \acp{PE} of the +parent team---via one of the \FUNC{shmem\_team\_split\_*} routines. + +All predefined teams are valid for the duration of the \openshmem +portion of an application. +Any team successfully created by a \FUNC{shmem\_team\_split\_*} +routine is valid until it is destroyed. +All valid teams have at least one member. + +\subsubsection*{Team Handles} + +A ``team handle'' is an opaque object with type \CTYPE{shmem\_team\_t} +that is used to reference a team. +Team handles are not remotely accessible objects. +The predefined teams may be accessed via the team handles listed in +Section~\ref{subsec:library_handles}. + +\openshmem communication routines that do not accept a team handle +argument operate on the default team, which may be accessed through +the \LibHandleRef{SHMEM\_TEAM\_WORLD} handle.
+The default team encompasses the set of all \acp{PE} in the \openshmem +program, and a \ac{PE} number in the default team is the same as the +value returned by \FUNC{shmem\_my\_pe}. + +A team handle may be initialized to or assigned the value +\CONST{SHMEM\_TEAM\_INVALID} to indicate that the handle does not +reference a valid team. +When managed in this way, applications can use an equality comparison +to test whether a given team handle references a valid team. + +\subsubsection*{Thread Safety} + +When it is allowed by the threading model provided by the \openshmem +library, a team may be used concurrently in non-collective operations +(e.g., \FUNC{shmem\_team\_my\_pe}) by multiple threads within the +\ac{PE} where it was created. +For collective operations, a team may not be used concurrently by +multiple threads in the same \ac{PE}. + +\subsubsection*{Collective Ordering} + +In \openshmem, a team object encapsulates resources used to communicate +between \acp{PE} in collective operations. When calling multiple subsequent +collective operations on a team, the collective operations---along with any +relevant team-based resources---are matched across the \acp{PE} in the team +based on ordering of collective routine calls. It is the responsibility +of the \openshmem program to ensure the same ordering of collective routine calls +across all \acp{PE} in a team. + +A full discussion of collective semantics follows in Section~\ref{subsec:coll}. + +\subsubsection*{Team Creation} + +Team creation is a collective operation on the parent team object. New teams +result from a \FUNC{shmem\_team\_split\_*} routine, which takes a parent team +and other arguments and produces new teams that are a subset of the parent +team. All \acp{PE} in a parent team must participate in a split operation +to create new teams.
If a \ac{PE} from the parent team is not a member of any +resulting new teams, it will receive a value of \CONST{SHMEM\_TEAM\_INVALID} +as the value for the new team handle. + +Teams that are created by a \FUNC{shmem\_team\_split\_*} routine may be +provided a configuration argument that specifies attributes of each new team. +This configuration argument is of type \CTYPE{shmem\_team\_config\_t}, which +is detailed further in Section~\ref{subsec:shmem_team_config_t}. + +\acp{PE} in a newly created team are consecutively numbered starting with +\ac{PE} number 0. \acp{PE} are always ordered by the existing global \ac{PE} number that +would be returned by the \FUNC{shmem\_my\_pe} routine. Team-relative \ac{PE} +numbers can be used for point-to-point operations through team-based +contexts (see Section~\ref{sec:ctx}) or using the translation routine +\FUNC{shmem\_team\_translate\_pe}. + +As with any collective routine on a team, the program must ensure that there +are no simultaneous split operations occurring on the same parent team on a +given \ac{PE}, i.e., in separate threads. + +As with any collective routine on a team, team creation is matched across \acp{PE} based +on ordering. So, team creation events must occur in the same order on all \acp{PE} +in the parent team. + +Upon completion of a team creation operation, the parent and any resulting child teams +will be immediately usable for any team-based operations, including creating new child +teams, without any intervening synchronization.
diff --git a/example_code/hybrid_mpi_mapping_id.c b/example_code/hybrid_mpi_mapping_id.c new file mode 100644 index 000000000..1e30b3879 --- /dev/null +++ b/example_code/hybrid_mpi_mapping_id.c @@ -0,0 +1,34 @@ +#include +#include +#include + +int main(int argc, char *argv[]) +{ + static long pSync[SHMEM_COLLECT_SYNC_SIZE]; + for (int i = 0; i < SHMEM_COLLECT_SYNC_SIZE; i++) + pSync[i] = SHMEM_SYNC_VALUE; + + MPI_Init(&argc, &argv); + shmem_init(); + + int mype = shmem_my_pe(); + int npes = shmem_n_pes(); + + static int myrank; + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + + int *mpi_ranks = shmem_calloc(npes, sizeof(int)); + + shmem_collect32(mpi_ranks, &myrank, 1, 0, 0, npes, pSync); + + if (mype == 0) + for (int i = 0; i < npes; i++) + printf("PE %d's MPI rank is %d\n", i, mpi_ranks[i]); + + shmem_free(mpi_ranks); + + shmem_finalize(); + MPI_Finalize(); + + return 0; +} diff --git a/example_code/shmem_alltoall_example.c b/example_code/shmem_alltoall_example.c index 13d11b75e..a5d102545 100644 --- a/example_code/shmem_alltoall_example.c +++ b/example_code/shmem_alltoall_example.c @@ -4,10 +4,6 @@ int main(void) { - static long pSync[SHMEM_ALLTOALL_SYNC_SIZE]; - for (int i = 0; i < SHMEM_ALLTOALL_SYNC_SIZE; i++) - pSync[i] = SHMEM_SYNC_VALUE; - shmem_init(); int me = shmem_my_pe(); int npes = shmem_n_pes(); @@ -23,11 +19,11 @@ int main(void) dest[(pe * count) + i] = 9999; } } - /* wait for all PEs to update source/dest */ - shmem_barrier_all(); + /* wait for all PEs to initialize source/dest */ + shmem_team_sync(SHMEM_TEAM_WORLD); /* alltoall on all PES */ - shmem_alltoall64(dest, source, count, 0, 0, npes, pSync); + shmem_int64_alltoall(SHMEM_TEAM_WORLD, dest, source, count); /* verify results */ for (int pe = 0; pe < npes; pe++) { diff --git a/example_code/shmem_alltoalls_example.c b/example_code/shmem_alltoalls_example.c index f78ff5687..5c135185a 100644 --- a/example_code/shmem_alltoalls_example.c +++ b/example_code/shmem_alltoalls_example.c @@ -4,10 +4,6 @@ int 
main(void) { - static long pSync[SHMEM_ALLTOALLS_SYNC_SIZE]; - for (int i = 0; i < SHMEM_ALLTOALLS_SYNC_SIZE; i++) - pSync[i] = SHMEM_SYNC_VALUE; - shmem_init(); int me = shmem_my_pe(); int npes = shmem_n_pes(); @@ -25,11 +21,11 @@ int main(void) dest[dst * ((pe * count) + i)] = 9999; } } - /* wait for all PEs to update source/dest */ - shmem_barrier_all(); + /* wait for all PEs to initialize source/dest */ + shmem_team_sync(SHMEM_TEAM_WORLD); /* alltoalls on all PES */ - shmem_alltoalls64(dest, source, dst, sst, count, 0, 0, npes, pSync); + shmem_int64_alltoalls(SHMEM_TEAM_WORLD, dest, source, dst, sst, count); /* verify results */ for (int pe = 0; pe < npes; pe++) { diff --git a/example_code/shmem_broadcast_example.c b/example_code/shmem_broadcast_example.c index a829448ea..8c0b84037 100644 --- a/example_code/shmem_broadcast_example.c +++ b/example_code/shmem_broadcast_example.c @@ -4,9 +4,6 @@ int main(void) { - static long pSync[SHMEM_BCAST_SYNC_SIZE]; - for (int i = 0; i < SHMEM_BCAST_SYNC_SIZE; i++) - pSync[i] = SHMEM_SYNC_VALUE; static long source[4], dest[4]; shmem_init(); @@ -17,7 +14,8 @@ int main(void) for (int i = 0; i < 4; i++) source[i] = i; - shmem_broadcast64(dest, source, 4, 0, 0, 0, npes, pSync); + shmem_broadcast(SHMEM_TEAM_WORLD, dest, source, 4, 0); + printf("%d: %ld, %ld, %ld, %ld\n", me, dest[0], dest[1], dest[2], dest[3]); shmem_finalize(); return 0; diff --git a/example_code/shmem_collect_example.c b/example_code/shmem_collect_example.c index b73733368..9eb569627 100644 --- a/example_code/shmem_collect_example.c +++ b/example_code/shmem_collect_example.c @@ -5,9 +5,6 @@ int main(void) { static long lock = 0; - static long pSync[SHMEM_COLLECT_SYNC_SIZE]; - for (int i = 0; i < SHMEM_COLLECT_SYNC_SIZE; i++) - pSync[i] = SHMEM_SYNC_VALUE; shmem_init(); int me = shmem_my_pe(); @@ -23,9 +20,10 @@ int main(void) for (int i = 0; i < total_nelem; i++) dest[i] = -9999; - shmem_barrier_all(); /* Wait for all PEs to update source/dest */ + /* Wait for 
all PEs to initialize source/dest: */ + shmem_team_sync(SHMEM_TEAM_WORLD); - shmem_collect32(dest, source, my_nelem, 0, 0, npes, pSync); + shmem_int_collect(SHMEM_TEAM_WORLD, dest, source, my_nelem); shmem_set_lock(&lock); /* Lock prevents interleaving printfs */ printf("%d: %d", me, dest[0]); diff --git a/example_code/shmem_put_signal_example.c b/example_code/shmem_put_signal_example.c new file mode 100644 index 000000000..179da6d26 --- /dev/null +++ b/example_code/shmem_put_signal_example.c @@ -0,0 +1,37 @@ +#include +#include +#include +#include + +int main(void) +{ + int i, err_count = 0; + + shmem_init(); + + size_t size = 2048; + int me = shmem_my_pe(); + int n = shmem_n_pes(); + int pe = (me + 1) % n; + uint64_t * message = malloc(size * sizeof(uint64_t)); + static uint64_t sig_addr = 0; + + for (i = 0; i < size; i++) { + message[i] = me; + } + + uint64_t *data = shmem_calloc(size, sizeof(uint64_t)); + + if (me == 0) { + shmem_put_signal(data, message, size, &sig_addr, 1, SHMEM_SIGNAL_SET, pe); + } else { + shmem_wait_until(&sig_addr, SHMEM_CMP_EQ, 1); + shmem_put_signal(data, data, size, &sig_addr, 1, SHMEM_SIGNAL_SET, pe); + } + + free(message); + shmem_free(data); + + shmem_finalize(); + return 0; +} diff --git a/example_code/shmem_query_mpi_progress.c b/example_code/shmem_query_mpi_progress.c new file mode 100644 index 000000000..063c320f7 --- /dev/null +++ b/example_code/shmem_query_mpi_progress.c @@ -0,0 +1,34 @@ +#include +#include +#include + +int main(int argc, char *argv[]) +{ + MPI_Init(&argc, &argv); + shmem_init(); + + int mype = shmem_my_pe(); + + if (!shmem_query_interoperability(SHMEM_PROGRESS_MPI)) + shmem_global_exit(EXIT_FAILURE); + + int a[100]; + static int b = 0; + if (mype == 0) { + MPI_Request req = MPI_REQUEST_NULL; + MPI_Isend(a, 100, MPI_INT, 1, 0, MPI_COMM_WORLD, &req); + + while (shmem_int_atomic_fetch(&b, 0) != 1); + + MPI_Wait(req, MPI_STATUS_IGNORE); + } else { + MPI_Recv(a, 100, MPI_INT, 0, 0, MPI_COMM_WORLD, 
MPI_STATUS_IGNORE); + + shmem_int_atomic_set(&b, 1, 0); + } + + shmem_finalize(); + MPI_Finalize(); + + return 0; +} diff --git a/example_code/shmem_reduce_example.c b/example_code/shmem_reduce_example.c new file mode 100644 index 000000000..24f8cbb2b --- /dev/null +++ b/example_code/shmem_reduce_example.c @@ -0,0 +1,57 @@ +#include +#include +#include + +/* As if we receive some value from external source */ +long recv_a_value(unsigned seed, int npes) { + srand(seed); + return rand() % npes; +} + +/* Validate the value we recieved */ +unsigned char is_valid(long value, int npes) { + if (value == (npes-1)) + return 0; + return 1; +} + +int main(void) +{ + + shmem_init(); + int me = shmem_my_pe(); + int npes = shmem_n_pes(); + size_t num = 32; + + long *values = shmem_malloc(num * sizeof(long)); + long *sums = shmem_malloc(num * sizeof(long)); + + unsigned char *valid_me = shmem_malloc(num * sizeof(unsigned char)); + unsigned char *valid_all = shmem_malloc(num * sizeof(unsigned char)); + + values[0] = recv_a_value((unsigned)me, npes); + valid_me[0] = is_valid(values[0], npes); + + for (int i=1; i < num; i++) { + values[i] = recv_a_value((unsigned)values[i-1], npes); + valid_me[i] = is_valid(values[i], npes); + } + + /* Wait for all PEs to initialize reductions arrays */ + shmem_sync(SHMEM_TEAM_WORLD); + + shmem_and_reduce(SHMEM_TEAM_WORLD, valid_all, valid_me, num); + shmem_sum_reduce(SHMEM_TEAM_WORLD, sums, values, num); + + for (int i=0; i < num; i++) { + if (valid_all[i]) { + printf ("[%d] = %ld\n", i, sums[i]); + } + else { + printf ("[%d] = invalid on one or more pe\n", i); + } + } + + shmem_finalize(); + return 0; +} diff --git a/example_code/shmem_sync_example.c b/example_code/shmem_sync_example.c index 8c447beee..2e367a428 100644 --- a/example_code/shmem_sync_example.c +++ b/example_code/shmem_sync_example.c @@ -4,25 +4,50 @@ int main(void) { static int x = 10101; - static long pSync[SHMEM_BARRIER_SYNC_SIZE]; + + shmem_team_t twos_team, threes_team; + 
shmem_team_config_t *config; shmem_init(); - int me = shmem_my_pe(); + config = NULL; + int me = shmem_my_pe(); int npes = shmem_n_pes(); - for (int i = 0; i < SHMEM_BARRIER_SYNC_SIZE; i++) - pSync[i] = SHMEM_SYNC_VALUE; + int odd_npes = npes % 2; + + shmem_team_split_strided(SHMEM_TEAM_WORLD, 0, 2, npes / 2, config, 0, + &twos_team); + + shmem_team_split_strided(SHMEM_TEAM_WORLD, 0, 3, npes / 3 + odd_npes, + config, 0, &threes_team); - shmem_sync_all(); + int my_pe_twos = shmem_team_my_pe(twos_team); + int my_pe_threes = shmem_team_my_pe(threes_team); + + if (twos_team != SHMEM_TEAM_INVALID) { + /* put the value 2 to the next team member in a circular fashion */ + shmem_p(&x, 2, (me + 2) % npes); + shmem_quiet(); + shmem_sync(twos_team); + } - if (me % 2 == 0) { - /* put to next even PE in a circular fashion */ - shmem_p(&x, 4, (me + 2) % npes); - /* synchronize all even pes */ + if (threes_team != SHMEM_TEAM_INVALID) { + /* put the value 3 to the next team member in a circular fashion */ + shmem_p(&x, 3, (me + 3) % npes); shmem_quiet(); - shmem_sync(0, 1, (npes / 2 + npes % 2), pSync); + shmem_sync(threes_team); } - printf("%d: x = %d\n", me, x); + + if (me % 3 == 0 && x != 3) { + shmem_global_exit(3); + } + else if (me % 2 == 0 && x != 2) { + shmem_global_exit(2); + } + else if (x != 10101) { + shmem_global_exit(1); + } + shmem_finalize(); return 0; } diff --git a/example_code/shmem_team_context.c b/example_code/shmem_team_context.c new file mode 100644 index 000000000..57d8c9621 --- /dev/null +++ b/example_code/shmem_team_context.c @@ -0,0 +1,112 @@ +#include +#include + +int isum, ival; + +int my_ctx_translate_pe(shmem_ctx_t src_ctx, int src_pe, shmem_ctx_t dest_ctx) +{ + if (src_ctx == SHMEM_CTX_INVALID) { + return -1; + } + if (dest_ctx == SHMEM_CTX_INVALID) { + return -1; + } + + shmem_team_t src_team, dest_team; + shmem_ctx_get_team(src_ctx, &src_team); + shmem_ctx_get_team(dest_ctx, &dest_team); + return shmem_team_translate_pe(src_team, src_pe, dest_pe); 
+} + +shmem_ctx_t my_team_create_ctx(shmem_team_t team) { + if (team == SHMEM_TEAM_INVALID) { + return SHMEM_CTX_INVALID; + } + + shmem_ctx_t ctx; + if (shmem_team_create_ctx(team, 0, &ctx) != 0) { + fprintf (stderr, "Failed to create context for PE team\n"); + return SHMEM_CTX_INVALID; + } + return ctx; +} + +void my_send_to_neighbor(shmem_ctx_t ctx, int *val) +{ + if (ctx == SHMEM_CTX_INVALID) { + fprintf (stderr, "Send to neighbor fail due to invalid context\n"); + return; + } + + shmem_team_t team; + shmem_ctx_get_team(ctx, &team); + int pe = shmem_team_my_pe(team); + int npes = shmem_team_n_pes(team); + int rpe = (pe + 1) % npes; + + // put my pe number in the buffer on my right hand neighbor + shmem_ctx_int_put(ctx, val, &pe, 1, rpe); +} + + + +int main() +{ + shmem_init(); + + int npes = shmem_n_pes(); + isum = 0; + + shmem_team_t team_2s, team_3s; + shmem_ctx_t ctx_2s, ctx_3s; + shmem_team_config_t conf; + conf.num_contexts = 1; + long cmask = SHMEM_TEAM_NUM_CONTEXTS; + + // Create team with PEs numbered 0, 2, 4, ... + shmem_team_spit_strided(SHMEM_TEAM_WORLD, 0, 2, npes / 2, &conf, cmask, &team_2s); + // Create team with PEs numbered 0, 3, 6, ... 
+ shmem_team_split_strided(SHMEM_TEAM_WORLD, 0, 3, npes / 3, &conf, cmask, &team_3s); + + ctx_2s = my_team_create_ctx(team_2s); + ctx_3s = my_team_create_ctx(team_3s); + + // Send some values using the two team contexts contexts + my_send_to_neighbor(ctx_2s, &ival2); + my_send_to_neighbor(ctx_3s, &ival3); + + // Quiet all contexts and synchronize all PEs to complete the data transfers + shmem_ctx_quiet(ctx_2s); + shmem_ctx_quiet(ctx_3s); + shmem_team_sync(SHMEM_TEAM_WORLD); + + // We will add up some results on pe 4 of team_3s using ctx_2s + if ((team_3s != SHMEM_TEAM_INVALID) && (team_2s != SHMEM_TEAM_INVALID)) { + int pe4_of_3s_in_2s = my_ctx_translate_pe(ctx_3s, 4, ctx_2s); + + if (pe4_of_3s_in_2s < 0) { + fprintf (stderr, "Fail to translate pe 4 from 3s context to 2s context\n"); + } + else { + // Add up the results on pe 4 of the 3s team, using the 2s team context + shmem_ctx_int_atomic_add(ctx_2s, &isum, ival2 + ival3, _pe4_of_3s_in_2s); + } + } + + // Quiet the context and synchronize PEs to complete the operation + shmem_ctx_quiet(ctx_2s); + shmem_team_sync(SHMEM_TEAM_WORLD); + + if (shmem_team_my_pe(team_3s) == 4) { + printf ("The total value on PE 4 of the 3s team is %d\n", isum); + } + + // Destroy contexts before teams + shmem_ctx_destroy(ctx_2s); + shmem_team_destroy(team_2s); + + shmem_ctx_destroy(ctx_3s); + shmem_team_destroy(team_3s); + + shmem_finalize(); +} diff --git a/example_code/shmem_team_split_2D.c b/example_code/shmem_team_split_2D.c new file mode 100644 index 000000000..2059f3279 --- /dev/null +++ b/example_code/shmem_team_split_2D.c @@ -0,0 +1,42 @@ +#include +#include + +int main(void) +{ + int xdim = 3; + int ydim = 4; + + shmem_init(); + int pe = shmem_my_pe(); + int npes = shmem_n_pes(); + + if (npes < (xdim*ydim)) { + printf ("Not enough PEs to create 4x3xN layout\n"); + exit(1); + } + + int zdim = (npes / (xdim*ydim)) + ( ((npes % (xdim*ydim)) > 0) ? 
1 : 0 ); + shmem_team_t xteam, yzteam, yteam, zteam; + + shmem_team_split_2d(SHMEM_TEAM_WORLD, xdim, NULL, 0, &xteam, NULL, 0, &yzteam); + // yzteam is immediately ready to be used in collectives + shmem_team_split_2d(yzteam, ydim, NULL, 0, &yteam, NULL, 0, &zteam); + + // We don't need the yzteam anymore + shmem_team_destroy(yzteam); + + int my_x = shmem_team_my_pe(xteam); + int my_y = shmem_team_my_pe(yteam); + int my_z = shmem_team_my_pe(zteam); + + for (int zdx = 0; zdx < zdim; zdx++) + for (int ydx = 0; ydx < ydim; ydx++) + for (int xdx = 0; xdx < xdim; xdx++) { + if ((my_x == xdx) && (my_y == ydx) && (my_z == zdx)) { + printf ("(%d, %d, %d) is me = %d\n", my_x, my_y, my_z, pe); + } + shmem_team_sync(SHMEM_TEAM_WORLD); + } + + shmem_finalize(); +} diff --git a/example_code/shmem_team_split_strided.c b/example_code/shmem_team_split_strided.c new file mode 100644 index 000000000..42973ba57 --- /dev/null +++ b/example_code/shmem_team_split_strided.c @@ -0,0 +1,35 @@ +/* + * OpenSHMEM shmem_team_split_strided example to create a team of all even + * ranked PEs from SHMEM_TEAM_WORLD + */ + +#include +#include + +int main(int argc, char *argv[]) +{ + int rank, npes; + int t_pe, t_size; + shmem_team_t new_team; + shmem_team_config_t *config; + + shmem_init(); + config = NULL; + rank = shmem_my_pe(); + npes = shmem_n_pes(); + + shmem_team_split_strided(SHMEM_TEAM_WORLD, 0, 2, npes / 2, config, 0, + &new_team); + + if (new_team != SHMEM_TEAM_INVALID) { + t_size = shmem_team_n_pes(new_team); + t_pe = shmem_team_my_pe(new_team); + + if ((rank % 2 != 0) || (rank / 2 != t_pe) || (npes / 2 != t_size)) { + shmem_global_exit(1); + } + } + + shmem_finalize(); + return 0; +} diff --git a/example_code/shmem_team_translate_pe.c b/example_code/shmem_team_translate_pe.c new file mode 100644 index 000000000..15aec0a6e --- /dev/null +++ b/example_code/shmem_team_translate_pe.c @@ -0,0 +1,32 @@ +#include +#include + +int main(void) +{ + int my_pe; + int n_pes; + int t_pe; + int 
t_global; + shmem_team_t new_team; + shmem_team_config_t *config; + + shmem_init(); + config = NULL; + my_pe = shmem_my_pe(); + n_pes = shmem_n_pes(); + + shmem_team_split_strided(SHMEM_TEAM_WORLD, 0, 2, (n_pes + 1) / 2, + config, 0, &new_team); + + if (new_team != SHMEM_TEAM_INVALID) { + t_pe = shmem_team_my_pe(new_team); + t_global = shmem_team_translate_pe(new_team, t_pe, SHMEM_TEAM_WORLD); + + if (t_global != my_pe) { + shmem_global_exit(1); + } + } + + shmem_finalize(); + return 0; +} diff --git a/example_code/shmem_test_any_example.c b/example_code/shmem_test_any_example.c index 144d2f0eb..64c784c3e 100644 --- a/example_code/shmem_test_any_example.c +++ b/example_code/shmem_test_any_example.c @@ -11,7 +11,7 @@ int main(void) int *status = calloc(npes, sizeof(int)); for (int i = 0; i < npes; i++) - shmem_p(&flags[mype], 1, i); + shmem_atomic_set(&flags[mype], 1, i); int ncompleted = 0; size_t completed_idx; diff --git a/example_code/shmem_test_example1.c b/example_code/shmem_test_example1.c index d3f760f44..8c0f5c9e9 100644 --- a/example_code/shmem_test_example1.c +++ b/example_code/shmem_test_example1.c @@ -22,7 +22,7 @@ int main(void) printf("PE %d observed first update from PE %d\n", mype, who); } else - shmem_p(&wait_vars[mype], mype, 0); + shmem_atomic_set(&wait_vars[mype], mype, 0); shmem_free(wait_vars); shmem_finalize(); diff --git a/example_code/shmem_test_some_example.c b/example_code/shmem_test_some_example.c index 27cfb9db3..9c73a6cdc 100644 --- a/example_code/shmem_test_some_example.c +++ b/example_code/shmem_test_some_example.c @@ -27,12 +27,12 @@ int main(void) shmem_fence(); for (int i = 0; i < npes; i++) - shmem_p(&flags[mype], 1, i); + shmem_atomic_set(&flags[mype], 1, i); int ncompleted = 0; while (ncompleted < npes) { - int ntested = shmem_test_some(flags, npes, indices, status, SHMEM_CMP_NE, 0); + int ntested = shmem_test_some(flags, npes, indices, status, SHMEM_CMP_NE, 0); if (ntested > 0) { for (int i = 0; i < ntested; i++) { for (int j 
= 0; j < N; j++) { diff --git a/example_code/shmem_wait3_example.c b/example_code/shmem_wait3_example.c deleted file mode 100644 index 2a6c3ee04..000000000 --- a/example_code/shmem_wait3_example.c +++ /dev/null @@ -1,8 +0,0 @@ -#include -#include - -int ivar; - -void wait_on_ivar(void) { - shmem_int_wait_until(&ivar, SHMEM_CMP_LT, 0); -} diff --git a/example_code/shmem_wait_until_all.c b/example_code/shmem_wait_until_all.c index 8d530cf6d..ed2eacaf7 100644 --- a/example_code/shmem_wait_until_all.c +++ b/example_code/shmem_wait_until_all.c @@ -10,7 +10,7 @@ int main(void) int *status = NULL; for (int i = 0; i < npes; i++) - shmem_p(&flags[mype], 1, i); + shmem_atomic_set(&flags[mype], 1, i); shmem_wait_until_all(flags, npes, status, SHMEM_CMP_EQ, 1); diff --git a/example_code/shmem_wait_until_any_all2all_sum.c b/example_code/shmem_wait_until_any_all2all_sum.c index eb139071f..3317fec6a 100644 --- a/example_code/shmem_wait_until_any_all2all_sum.c +++ b/example_code/shmem_wait_until_any_all2all_sum.c @@ -26,7 +26,7 @@ int main(void) shmem_fence(); for (int i = 0; i < npes; i++) - shmem_p(&flags[mype], 1, i); + shmem_atomic_set(&flags[mype], 1, i); for (int i = 0; i < npes; i++) { size_t completed_idx = shmem_wait_until_any(flags, npes, status, SHMEM_CMP_NE, 0); diff --git a/example_code/shmem_wait_until_any_vector.c b/example_code/shmem_wait_until_any_vector.c new file mode 100644 index 000000000..266585e51 --- /dev/null +++ b/example_code/shmem_wait_until_any_vector.c @@ -0,0 +1,42 @@ +#include +#include + +#define N 100 + +int main(void) +{ + int total_sum = 0; + + shmem_init(); + int mype = shmem_my_pe(); + int npes = shmem_n_pes(); + + int *ivars = shmem_calloc(npes, sizeof(int)); + int *status = calloc(npes, sizeof(int)); + int *cmp_values = malloc(npes * sizeof(int)); + + /* All odd PEs put 2 and all even PEs put 1 */ + for (int i = 0; i < npes; i++) { + shmem_atomic_set(&ivars[mype], mype % 2 + 1, i); + + /* Set cmp_values to the expected values coming from 
each PE */ + cmp_values[i] = i % 2 + 1; + } + + for (int i = 0; i < npes; i++) { + size_t completed_idx = shmem_wait_until_any_vector(ivars, npes, status, + SHMEM_CMP_EQ, cmp_values); + status[completed_idx] = 1; + total_sum += ivars[completed_idx]; + } + + /* check the result */ + int correct_result = npes + npes / 2; + + if (total_sum != correct_result) { + shmem_global_exit(1); + } + + shmem_finalize(); + return 0; +} diff --git a/example_code/shmem_wait_until_some_all2all_sum.c b/example_code/shmem_wait_until_some_all2all_sum.c index 83caa6c5c..1c1a2f951 100644 --- a/example_code/shmem_wait_until_some_all2all_sum.c +++ b/example_code/shmem_wait_until_some_all2all_sum.c @@ -27,7 +27,7 @@ int main(void) shmem_fence(); for (int i = 0; i < npes; i++) - shmem_p(&flags[mype], 1, i); + shmem_atomic_set(&flags[mype], 1, i); size_t ncompleted; while ((ncompleted = shmem_wait_until_some(flags, npes, indices, diff --git a/main_spec.tex b/main_spec.tex index f38c72d4b..e0750872f 100644 --- a/main_spec.tex +++ b/main_spec.tex @@ -34,9 +34,6 @@ \section{Library Handles}\label{subsec:library_handles} \section{Environment Variables }\label{subsec:environment_variables} \input{content/environment_variables} - - - \clearpage @@ -106,6 +103,38 @@ \subsubsection{\textbf{SHMEM\_MALLOC, SHMEM\_FREE, SHMEM\_REALLOC, SHMEM\_ALIGN} \subsubsection{\textbf{SHMEM\_CALLOC}}\label{subsec:shmem_calloc} \input{content/shmem_calloc.tex} + + +\subsection{Team Management Routines}\label{subsec:team} +\input{content/teams_intro.tex} + +\subsubsection{\textbf{SHMEM\_TEAM\_MY\_PE}}\label{subsec:shmem_team_my_pe} +\input{content/shmem_team_my_pe.tex} + +\subsubsection{\textbf{SHMEM\_TEAM\_N\_PES}}\label{subsec:shmem_team_n_pes} +\input{content/shmem_team_n_pes.tex} + +\subsubsection{\textbf{SHMEM\_TEAM\_CONFIG\_T}} +\label{subsec:shmem_team_config_t} +\input{content/shmem_team_config_t.tex} + +\subsubsection{\textbf{SHMEM\_TEAM\_GET\_CONFIG}}\label{subsec:shmem_team_get_config} 
+\input{content/shmem_team_get_config.tex} + +\subsubsection{\textbf{SHMEM\_TEAM\_TRANSLATE\_PE}}\label{subsec:shmem_team_translate_pe} +\input{content/shmem_team_translate_pe.tex} + +\subsubsection{\textbf{SHMEM\_TEAM\_SPLIT\_STRIDED}}\label{subsec:shmem_team_split_strided} +\input{content/shmem_team_split_strided.tex} + +\subsubsection{\textbf{SHMEM\_TEAM\_SPLIT\_2D}}\label{subsec:shmem_team_split_2d} +\input{content/shmem_team_split_2d.tex} + +\subsubsection{\textbf{SHMEM\_TEAM\_DESTROY}}\label{subsec:shmem_team_destroy} +\input{content/shmem_team_destroy.tex} + + + \subsection{Communication Management Routines} \label{sec:ctx} All \openshmem \ac{RMA}, \ac{AMO}, and memory ordering routines must be @@ -139,14 +168,37 @@ \subsection{Communication Management Routines} When managed in this way, applications can use an equality comparison to test whether a given context handle references a valid context. +Every communication context is associated with a team. +This association is established at context creation. +Communication contexts created by \FUNC{shmem\_ctx\_create} are +associated with the default team, while contexts created by +\FUNC{shmem\_team\_create\_ctx} are associated with and created from a team +specified at context creation. +The default context is associated with the default team. +A context's associated team specifies the set of \acp{PE} over which +\ac{PE}-specific routines that operate on a communication context, +explicitly or implicitly, are performed. +All point-to-point routines that operate on this context will do so with +respect to the team-relative \ac{PE} numbering of the associated team. +If the PE number passed to such a routine is invalid, being negative or greater +than or equal to the size of the \openshmem team, then the behavior is undefined. 
+ \subsubsection{\textbf{SHMEM\_CTX\_CREATE}} \label{subsec:shmem_ctx_create} \input{content/shmem_ctx_create.tex} +\subsubsection{\textbf{SHMEM\_TEAM\_CREATE\_CTX}} +\label{subsec:shmem_team_create_ctx} +\input{content/shmem_team_create_ctx.tex} + \subsubsection{\textbf{SHMEM\_CTX\_DESTROY}} \label{subsec:shmem_ctx_destroy} \input{content/shmem_ctx_destroy.tex} +\subsubsection{\textbf{SHMEM\_CTX\_GET\_TEAM}} +\label{subsec:shmem_ctx_get_team} +\input{content/shmem_ctx_get_team.tex} + \subsection{Remote Memory Access Routines}\label{sec:rma} \input{content/rma_intro.tex} @@ -169,7 +221,7 @@ \subsubsection{\textbf{SHMEM\_G}}\label{subsec:shmem_g} \subsubsection{\textbf{SHMEM\_IGET}}\label{subsec:shmem_iget} \input{content/shmem_iget.tex} -\subsection{Non-blocking Remote Memory Access Routines}\label{sec:rma_nbi} +\subsection{Nonblocking Remote Memory Access Routines}\label{sec:rma_nbi} \subsubsection{\textbf{SHMEM\_PUT\_NBI}}\label{subsec:shmem_put_nbi} \input{content/shmem_put_nbi.tex} @@ -178,6 +230,7 @@ \subsubsection{\textbf{SHMEM\_GET\_NBI}}\label{subsec:shmem_get_nbi} \input{content/shmem_get_nbi.tex} + \subsection{Atomic Memory Operations}\label{sec:amo} \input{content/atomics_intro} @@ -237,6 +290,97 @@ \subsubsection{\textbf{SHMEM\_ATOMIC\_XOR}} \label{subsec:shmem_atomic_xor} \input{content/shmem_atomic_xor.tex} +\subsection{Nonblocking Atomic Memory Operations}\label{sec:amo-nbi} + +\subsubsection{\textbf{SHMEM\_ATOMIC\_FETCH\_NBI}} +\label{subsec:shmem_atomic_fetch_nbi} +\input{content/shmem_atomic_fetch_nbi.tex} + +\subsubsection{\textbf{SHMEM\_ATOMIC\_COMPARE\_SWAP\_NBI}} +\label{subsec:shmem_atomic_compare_swap_nbi} +\input{content/shmem_atomic_compare_swap_nbi.tex} + +\subsubsection{\textbf{SHMEM\_ATOMIC\_SWAP\_NBI}} +\label{subsec:shmem_atomic_swap_nbi} +\input{content/shmem_atomic_swap_nbi.tex} + +\subsubsection{\textbf{SHMEM\_ATOMIC\_FETCH\_INC\_NBI}} +\label{subsec:shmem_atomic_fetch_inc_nbi} +\input{content/shmem_atomic_fetch_inc_nbi.tex} + 
+\subsubsection{\textbf{SHMEM\_ATOMIC\_FETCH\_ADD\_NBI}} +\label{subsec:shmem_atomic_fetch_add_nbi} +\input{content/shmem_atomic_fetch_add_nbi.tex} + +\subsubsection{\textbf{SHMEM\_ATOMIC\_FETCH\_AND\_NBI}} +\label{subsec:shmem_atomic_fetch_and_nbi} +\input{content/shmem_atomic_fetch_and_nbi.tex} + +\subsubsection{\textbf{SHMEM\_ATOMIC\_FETCH\_OR\_NBI}} +\label{subsec:shmem_atomic_fetch_or_nbi} +\input{content/shmem_atomic_fetch_or_nbi.tex} + +\subsubsection{\textbf{SHMEM\_ATOMIC\_FETCH\_XOR\_NBI}} +\label{subsec:shmem_atomic_fetch_xor_nbi} +\input{content/shmem_atomic_fetch_xor_nbi.tex} + + + +\subsection{Signaling Operations}\label{sec:shmem_signal} +This section specifies the \openshmem support for \OPR{put-with-signal}, +nonblocking \OPR{put-with-signal}, and \OPR{signal-fetch} routines. The +put-with-signal routines provide a method for copying data from a contiguous +local data object to a data object on a specified \ac{PE} and subsequently +updating a remote flag to signal completion. The signal-fetch routine provides +support for fetching a signal update operation. + +\subsubsection{Atomicity Guarantees for Signaling Operations} +\label{subsec:signal_atomicity} +All signaling operations---put-with-signal, nonblocking put-with-signal, and +signal-fetch---are performed on a signal data object, a remotely accessible +symmetric object of type \VAR{uint64\_t}. A signal operator in the +put-with-signal routine is an \openshmem library constant that determines the +type of update to be performed as a signal on the signal data object. 
+ +All signaling operations on the signal data object complete as if performed +atomically with respect to the following: +\begin{itemize} + \item any other blocking or nonblocking variant of the put-with-signal routine + that updates the signal data object using the same signal update operator; + \item any signal-fetch routine that fetches the signal data object; and + \item any point-to-point synchronization routine that accesses the signal + data object. +\end{itemize} + +\subsubsection{Available Signal Operators} +\label{subsec:signal_operator} + +With the atomicity guarantees as described in +Section~\ref{subsec:signal_atomicity}, the following options can be used as a +signal operator. + + \apitablerow{\LibConstRef{SHMEM\_SIGNAL\_SET}}{An update to the signal data + object is an atomic set operation. It writes an unsigned 64-bit value as a + signal into the signal data object on a remote \VAR{PE} as an atomic + operation.} + + \apitablerow{\LibConstRef{SHMEM\_SIGNAL\_ADD}}{An update to the signal data + object is an atomic add operation. 
It adds an unsigned 64-bit value as a + signal into the signal data object on a remote \VAR{PE} as an atomic + operation.} + + +\subsubsection{\textbf{SHMEM\_PUT\_SIGNAL}}\label{subsec:shmem_put_signal} +\input{content/shmem_put_signal.tex} + +\subsubsection{\textbf{SHMEM\_PUT\_SIGNAL\_NBI}}\label{subsec:shmem_put_signal_nbi} +\input{content/shmem_put_signal_nbi.tex} + +\subsubsection{\textbf{SHMEM\_SIGNAL\_FETCH}}\label{subsec:shmem_signal_fetch} +\input{content/shmem_signal_fetch.tex} + + + \subsection{Collective Routines}\label{subsec:coll} \input{content/collective_intro.tex} @@ -246,12 +390,12 @@ \subsubsection{\textbf{SHMEM\_BARRIER\_ALL}}\label{subsec:shmem_barrier_all} \subsubsection{\textbf{SHMEM\_BARRIER}}\label{subsec:shmem_barrier} \input{content/shmem_barrier.tex} -\subsubsection{\textbf{SHMEM\_SYNC\_ALL}}\label{subsec:shmem_sync_all} -\input{content/shmem_sync_all.tex} - \subsubsection{\textbf{SHMEM\_SYNC}}\label{subsec:shmem_sync} \input{content/shmem_sync.tex} +\subsubsection{\textbf{SHMEM\_SYNC\_ALL}}\label{subsec:shmem_sync_all} +\input{content/shmem_sync_all.tex} + \subsubsection{\textbf{SHMEM\_BROADCAST}}\label{subsec:shmem_broadcast} \input{content/shmem_broadcast.tex} @@ -270,7 +414,6 @@ \subsubsection{\textbf{SHMEM\_ALLTOALLS}}\label{subsec:shmem_alltoalls} - \subsection{Point-To-Point Synchronization Routines}\label{subsec:p2p_intro} \input{content/p2p_sync_intro.tex} @@ -286,6 +429,15 @@ \subsubsection{\textbf{SHMEM\_WAIT\_UNTIL\_ANY}}\label{subsec:shmem_wait_until_a \subsubsection{\textbf{SHMEM\_WAIT\_UNTIL\_SOME}}\label{subsec:shmem_wait_until_some} \input{content/shmem_wait_until_some.tex} +\subsubsection{\textbf{SHMEM\_WAIT\_UNTIL\_ALL\_VECTOR}}\label{subsec:shmem_wait_until_all_vector} +\input{content/shmem_wait_until_all_vector.tex} + +\subsubsection{\textbf{SHMEM\_WAIT\_UNTIL\_ANY\_VECTOR}}\label{subsec:shmem_wait_until_any_vector} +\input{content/shmem_wait_until_any_vector.tex} + 
+\subsubsection{\textbf{SHMEM\_WAIT\_UNTIL\_SOME\_VECTOR}}\label{subsec:shmem_wait_until_some_vector} +\input{content/shmem_wait_until_some_vector.tex} + \subsubsection{\textbf{SHMEM\_TEST}}\label{subsec:shmem_test} \input{content/shmem_test.tex} @@ -298,13 +450,26 @@ \subsubsection{\textbf{SHMEM\_TEST\_ANY}}\label{subsec:shmem_test_any} \subsubsection{\textbf{SHMEM\_TEST\_SOME}}\label{subsec:shmem_test_some} \input{content/shmem_test_some.tex} +\subsubsection{\textbf{SHMEM\_TEST\_ALL\_VECTOR}}\label{subsec:shmem_test_all_vector} +\input{content/shmem_test_all_vector.tex} + +\subsubsection{\textbf{SHMEM\_TEST\_ANY\_VECTOR}}\label{subsec:shmem_test_any_vector} +\input{content/shmem_test_any_vector.tex} + +\subsubsection{\textbf{SHMEM\_TEST\_SOME\_VECTOR}}\label{subsec:shmem_test_some_vector} +\input{content/shmem_test_some_vector.tex} + +\subsubsection{\textbf{SHMEM\_SIGNAL\_WAIT\_UNTIL}}\label{subsec:shmem_signal_wait_until} +\input{content/shmem_signal_wait_until.tex} \subsection{Memory Ordering Routines}\label{subsec:memory_order} The following section discusses \openshmem \acp{API} that provide mechanisms to -ensure ordering and/or delivery of \OPR{Put}, \ac{AMO}, memory store, -and non-blocking \PUT{} and \GET{} routines to symmetric data objects. +ensure ordering and/or delivery of memory store, blocking \OPR{Put}, \ac{AMO}, +and \OPR{put-with-signal}, as well as nonblocking \PUT{}, +\OPR{put-with-signal}, \GET{}, and \ac{AMO} routines to symmetric data +objects. 
\subsubsection{\textbf{SHMEM\_FENCE}}\label{subsec:shmem_fence} \input{content/shmem_fence.tex} diff --git a/utils/defs.tex b/utils/defs.tex index bae063fe0..f5a2235b7 100644 --- a/utils/defs.tex +++ b/utils/defs.tex @@ -359,7 +359,6 @@ {\strikeline\mbox{} \DeprecationStart \stretchline\mbox{}}} \newcommand{\EndDeprecateBlock}{% \mbox{}\stretchline\mbox{} \DeprecationEnd \strikeline} - \newenvironment{DeprecateBlock}{% \par \StartDeprecateBlock \par}{\par \EndDeprecateBlock \par} @@ -369,16 +368,25 @@ \strikeline\mbox{} \DeprecationEnd \strikeline} \newenvironment{DeprecateInline}{\StartInlineDeprecate}{\EndInlineDeprecate} +\newcommand{\deprecationstart}{% + \color{red} \strikeline\mbox{} deprecation start \stretchline \mbox{}} +\newcommand{\deprecationend}{% + \mbox{}\stretchline\mbox{} \color{red} deprecation end \strikeline} +\newenvironment{deprecate}{\deprecationstart \\}{\\ \deprecationend} + % -% Library API description template commands +% Design feedback request helpers % -\newcommand{\deprecationstart}{\color{red} \raisebox{.5ex}{\rule{1em}{.4pt}} - deprecation start \xrfill[.5ex]{.4pt}[red] \mbox{}} -\newcommand{\deprecationend}{\mbox{}\xrfill[.5ex]{.4pt}[red]\mbox{} \color{red} - deprecation end \raisebox{.5ex}{\rule{1em}{.4pt}}} +\newcommand{\feedbackstart}{\color{RoyalBlue} \strikeline[RoyalBlue] + design feedback requested \stretchline[RoyalBlue] \mbox{}} +\newcommand{\feedbackend}{\mbox{}\stretchline[RoyalBlue]\mbox{}} -\newenvironment{deprecate}{\deprecationstart \\}{\\ \deprecationend} +\newenvironment{FeedbackRequest}{\feedbackstart \\}{\\ \feedbackend} + +% +% Library API description template commands +% \newcommand{\apisummary}[1]{ #1 @@ -407,14 +415,16 @@ \textbf{C11:} \lstset{language={C}, backgroundcolor=\color{gray}, lineskip=2pt, escapechar=@, - morekeywords={size_t, ptrdiff_t, TYPE, _Noreturn, shmem_ctx_t}, + morekeywords={size_t, ptrdiff_t, TYPE, _Noreturn, shmem_ctx_t, + shmem_team_t, shmem_team_config_t, uint64_t}, aboveskip=0pt, 
belowskip=0pt}}{} \lstnewenvironment{CsynopsisCol} { \lstset{language={C}, backgroundcolor=\color{gray}, lineskip=2pt, escapechar=@, - morekeywords={size_t, ptrdiff_t, TYPE, TYPENAME, SIZE, shmem_ctx_t}, + morekeywords={size_t, ptrdiff_t, TYPE, TYPENAME, SIZE, shmem_ctx_t, + shmem_team_t, shmem_team_config_t, uint64_t}, aboveskip=0pt, belowskip=0pt}}{} @@ -423,7 +433,8 @@ \textbf{C/C++:} \lstset{language={C}, backgroundcolor=\color{gray}, lineskip=2pt, escapechar=@, - morekeywords={size_t, ptrdiff_t, TYPE, TYPENAME, SIZE, shmem_ctx_t}, + morekeywords={size_t, ptrdiff_t, TYPE, TYPENAME, SIZE, shmem_ctx_t, + shmem_team_t, shmem_team_config_t, uint64_t}, aboveskip=0pt, belowskip=0pt}}{} \lstnewenvironment{CsynopsisST} @@ -432,7 +443,8 @@ \color{red} {\lstset{language={C}, backgroundcolor=\color{gray}, lineskip=2pt, escapechar=@, - morekeywords={size_t, ptrdiff_t, TYPE, TYPENAME, SIZE, shmem_ctx_t}, + morekeywords={size_t, ptrdiff_t, TYPE, TYPENAME, SIZE, shmem_ctx_t, + shmem_team_t, uint64_t}, aboveskip=0pt, belowskip=0pt}}}{} \lstnewenvironment{Fsynopsis} @@ -515,7 +527,7 @@ ##1 \lstinputlisting[language={C}, tabsize=2, basicstyle=\ttfamily\footnotesize, - morekeywords={size_t, ptrdiff_t, shmem_ctx_t, _Thread_local}]{##2} + morekeywords={size_t, ptrdiff_t, shmem_ctx_t, _Thread_local, shmem_team_t, uint64_t}]{##2} ##3 } \newcommand{\apifexample}[3]{ ##1 diff --git a/utils/packages.tex b/utils/packages.tex index 48ca5a34d..d3f54c6c4 100644 --- a/utils/packages.tex +++ b/utils/packages.tex @@ -3,10 +3,12 @@ \usepackage[utf8]{inputenc} \usepackage{graphicx} \usepackage{multicol} +\usepackage{multirow} \usepackage[normalem]{ulem} \usepackage{float} \usepackage[usenames,dvipsnames]{color} \usepackage{amsmath} +\usepackage{amsfonts} \usepackage[table]{xcolor} \usepackage{xspace} \usepackage{xhfill} @@ -15,7 +17,6 @@ \usepackage{listings} % note sure after here \usepackage{makeidx} -\usepackage{amsmath} \usepackage[UKenglish]{isodate} \usepackage{ifthen} \usepackage{textcomp}