From a4e22d7c7050008cdbbf845bb892425e87849428 Mon Sep 17 00:00:00 2001 From: Matthew Whitlock Date: Tue, 21 Oct 2025 14:30:10 -0500 Subject: [PATCH 01/51] Retain coll info when adding a callback to the event queue Signed-off-by: Matthew Whitlock --- ompi/mca/coll/ftagree/coll_ftagree_earlyreturning.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ompi/mca/coll/ftagree/coll_ftagree_earlyreturning.c b/ompi/mca/coll/ftagree/coll_ftagree_earlyreturning.c index 73ef2f06374..6f954166006 100644 --- a/ompi/mca/coll/ftagree/coll_ftagree_earlyreturning.c +++ b/ompi/mca/coll/ftagree/coll_ftagree_earlyreturning.c @@ -1930,6 +1930,7 @@ static void *era_error_event_cb(int fd, int flags, void *context) { ompi_coll_ftagree_era_agreement_info_t* ci = event->ci; free(event); era_mark_process_failed(ci, r); + OBJ_RELEASE(ci); return NULL; } @@ -1948,6 +1949,7 @@ static void era_mark_process_failed(ompi_coll_ftagree_era_agreement_info_t *ci, event->rank = rank; opal_event_evtimer_set(opal_sync_event_base, &event->ev, era_error_event_cb, event); opal_event_add(&event->ev, &now); + OBJ_RETAIN(ci); return; } From 846e22f90488904e51b469d52906a857f05b7ec9 Mon Sep 17 00:00:00 2001 From: Edgar Gabriel Date: Fri, 24 Oct 2025 14:41:17 +0000 Subject: [PATCH 02/51] update VERSION file Signed-off-by: Edgar Gabriel --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index 9baf32a6304..44eb7329d5d 100644 --- a/VERSION +++ b/VERSION @@ -15,7 +15,7 @@ # major, minor, and release are generally combined in the form # ... -major=5 +major=6 minor=1 release=0 From 80f8cfbe835cdb555959a33f9790823483a0fa5f Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Sat, 25 Oct 2025 17:58:02 -0400 Subject: [PATCH 03/51] Fix redundant declaration of MPI_Recv_c Signed-off-by: Joseph Schuchart --- ompi/include/mpi.h.in | 4 ---- 1 file changed, 4 deletions(-) diff --git a/ompi/include/mpi.h.in b/ompi/include/mpi.h.in index 5730d67ef8a..e06865b182f 100644 --- a/ompi/include/mpi.h.in +++ b/ompi/include/mpi.h.in @@ -2195,8 +2195,6 @@ OMPI_DECLSPEC int MPI_Recv(void *buf, int count, MPI_Datatype datatype, int sou int tag, MPI_Comm comm, MPI_Status *status); OMPI_DECLSPEC int MPI_Recv_c(void *buf, MPI_Count count, MPI_Datatype datatype, int source, int tag, MPI_Comm comm, MPI_Status *status); -OMPI_DECLSPEC int MPI_Recv_c(void *buf, MPI_Count count, MPI_Datatype datatype, int source, - int tag, MPI_Comm comm, MPI_Status *status); OMPI_DECLSPEC int MPI_Reduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, int root, MPI_Comm comm); OMPI_DECLSPEC int MPI_Reduce_c(const void *sendbuf, void *recvbuf, MPI_Count count, MPI_Datatype datatype, @@ -3371,8 +3369,6 @@ OMPI_DECLSPEC int PMPI_Recv(void *buf, int count, MPI_Datatype datatype, int so int tag, MPI_Comm comm, MPI_Status *status); OMPI_DECLSPEC int PMPI_Recv_c(void *buf, MPI_Count count, MPI_Datatype datatype, int source, int tag, MPI_Comm comm, MPI_Status *status); -OMPI_DECLSPEC int PMPI_Recv_c(void *buf, MPI_Count count, MPI_Datatype datatype, int source, - int tag, MPI_Comm comm, MPI_Status *status); OMPI_DECLSPEC int PMPI_Reduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, int root, MPI_Comm comm); OMPI_DECLSPEC int PMPI_Reduce_c(const void *sendbuf, void *recvbuf, MPI_Count count, MPI_Datatype datatype, From 9bd775769b2d64286a50c768d3a543312de4aaa5 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Wed, 2 Nov 2022 09:51:54 -0400 Subject: [PATCH 04/51] tuned: use tree 
instead of bruck at scale The switch from tree to bruck between 512 and 1023 processes leads to unexpected latency changes in benchmarks of other collectives. We should be consistent here. There is no good reason for why bruck would perform better in that range but not beyond. Signed-off-by: Joseph Schuchart --- ompi/mca/coll/tuned/coll_tuned_decision_fixed.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c b/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c index fa31aef1860..d210ff4412f 100644 --- a/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c +++ b/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c @@ -490,14 +490,8 @@ int ompi_coll_tuned_barrier_intra_dec_fixed(struct ompi_communicator_t *comm, alg = 3; } else if (communicator_size < 256) { alg = 4; - } else if (communicator_size < 512) { - alg = 6; - } else if (communicator_size < 1024) { - alg = 4; - } else if (communicator_size < 4096) { - alg = 6; } else { - alg = 4; + alg = 6; } return ompi_coll_tuned_barrier_intra_do_this (comm, module, From 0213485ae2067d54381806d9c617eef89837207b Mon Sep 17 00:00:00 2001 From: Howard Pritchard Date: Mon, 27 Oct 2025 08:49:58 -0600 Subject: [PATCH 05/51] make type_get_value_index language aware if input index type is not fortran, only check for corresponding 'c' pair types. Related to #13458 Checked against the test case in mpi4py that's activated when one declares one's MPI is 4.1 or higher compliant. Signed-off-by: Howard Pritchard --- ompi/datatype/ompi_datatype_create.c | 64 +++++++++++++++------------- 1 file changed, 34 insertions(+), 30 deletions(-) diff --git a/ompi/datatype/ompi_datatype_create.c b/ompi/datatype/ompi_datatype_create.c index 4b01f2dfed6..76e87a0593f 100644 --- a/ompi/datatype/ompi_datatype_create.c +++ b/ompi/datatype/ompi_datatype_create.c @@ -133,42 +133,46 @@ int ompi_datatype_get_value_index(const ompi_datatype_t *value_type, const ompi_datatype_t *index_type, ompi_datatype_t **pair_type) { *pair_type = (ompi_datatype_t *)&ompi_mpi_datatype_null; - - /* C predefined data types */ - if (index_type->id == OMPI_DATATYPE_MPI_INT) { - if (value_type->id == OMPI_DATATYPE_MPI_FLOAT) { - *pair_type = (ompi_datatype_t *)&ompi_mpi_float_int; - } else if (value_type->id == OMPI_DATATYPE_MPI_DOUBLE) { - *pair_type = (ompi_datatype_t *)&ompi_mpi_double_int; - } else if (value_type->id == OMPI_DATATYPE_MPI_LONG) { - *pair_type = (ompi_datatype_t *)&ompi_mpi_long_int; - } else if (value_type->id == OMPI_DATATYPE_MPI_SHORT) { - *pair_type = (ompi_datatype_t *)&ompi_mpi_short_int; - } else if (value_type->id == OMPI_DATATYPE_MPI_INT) { - *pair_type = (ompi_datatype_t *)&ompi_mpi_2int; - } else if (value_type->id == OMPI_DATATYPE_MPI_LONG_DOUBLE) { - *pair_type = (ompi_datatype_t *)&ompi_mpi_longdbl_int; + bool is_fortran = ((index_type->super.flags & OMPI_DATATYPE_FLAG_DATA_FORTRAN) == OMPI_DATATYPE_FLAG_DATA_FORTRAN) ? 
true : false; + + if (false == is_fortran) { + if (index_type->id == OMPI_DATATYPE_MPI_INT) { + if (value_type->id == OMPI_DATATYPE_MPI_FLOAT) { + *pair_type = (ompi_datatype_t *)&ompi_mpi_float_int; + } else if (value_type->id == OMPI_DATATYPE_MPI_DOUBLE) { + *pair_type = (ompi_datatype_t *)&ompi_mpi_double_int; + } else if (value_type->id == OMPI_DATATYPE_MPI_LONG) { + *pair_type = (ompi_datatype_t *)&ompi_mpi_long_int; + } else if (value_type->id == OMPI_DATATYPE_MPI_SHORT) { + *pair_type = (ompi_datatype_t *)&ompi_mpi_short_int; + } else if (value_type->id == OMPI_DATATYPE_MPI_INT) { + *pair_type = (ompi_datatype_t *)&ompi_mpi_2int; + } else if (value_type->id == OMPI_DATATYPE_MPI_LONG_DOUBLE) { + *pair_type = (ompi_datatype_t *)&ompi_mpi_longdbl_int; + } } /* Fortran predefined data types */ - } else if ((index_type->id == OMPI_DATATYPE_MPI_INTEGER) && - (value_type->id == OMPI_DATATYPE_MPI_INTEGER)) { - *pair_type = (ompi_datatype_t *)&ompi_mpi_2integer; - } else if ((index_type->id == OMPI_DATATYPE_MPI_FLOAT) && - (value_type->id == OMPI_DATATYPE_MPI_FLOAT)) { - *pair_type = (ompi_datatype_t *)&ompi_mpi_2real; - } else if ((index_type->id == OMPI_DATATYPE_MPI_DOUBLE) && - (value_type->id == OMPI_DATATYPE_MPI_DOUBLE)) { - *pair_type = (ompi_datatype_t *)&ompi_mpi_2dblprec; + } else { + if ((index_type->id == OMPI_DATATYPE_MPI_INTEGER) && + (value_type->id == OMPI_DATATYPE_MPI_INTEGER)) { + *pair_type = (ompi_datatype_t *)&ompi_mpi_2integer; + } else if ((index_type->id == OMPI_DATATYPE_MPI_FLOAT) && + (value_type->id == OMPI_DATATYPE_MPI_FLOAT)) { + *pair_type = (ompi_datatype_t *)&ompi_mpi_2real; + } else if ((index_type->id == OMPI_DATATYPE_MPI_DOUBLE) && + (value_type->id == OMPI_DATATYPE_MPI_DOUBLE)) { + *pair_type = (ompi_datatype_t *)&ompi_mpi_2dblprec; #if OMPI_HAVE_FORTRAN_COMPLEX - } else if ((index_type->id == OMPI_DATATYPE_MPI_COMPLEX) && - (value_type->id == OMPI_DATATYPE_MPI_COMPLEX)) { - *pair_type = (ompi_datatype_t *)&ompi_mpi_2cplex; + } else if ((index_type->id == OMPI_DATATYPE_MPI_COMPLEX) && + (value_type->id == OMPI_DATATYPE_MPI_COMPLEX)) { + *pair_type = (ompi_datatype_t *)&ompi_mpi_2cplex; #endif #if OMPI_HAVE_FORTRAN_DOUBLE_COMPLEX - } else if ((index_type->id == OMPI_DATATYPE_MPI_DOUBLE_COMPLEX) && - (value_type->id == OMPI_DATATYPE_MPI_DOUBLE_COMPLEX)) { - *pair_type = (ompi_datatype_t *)&ompi_mpi_2dblcplex; + } else if ((index_type->id == OMPI_DATATYPE_MPI_DOUBLE_COMPLEX) && + (value_type->id == OMPI_DATATYPE_MPI_DOUBLE_COMPLEX)) { + *pair_type = (ompi_datatype_t *)&ompi_mpi_2dblcplex; #endif + } } return OMPI_SUCCESS; From b2d103c3842c8bfcfa66af526dd8701f3c256451 Mon Sep 17 00:00:00 2001 From: Van Man NGUYEN Date: Thu, 23 Oct 2025 11:40:06 +0200 Subject: [PATCH 06/51] UBCL: Add PML/UBCL and OSC/UBCL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Florent GERMAIN Co-authored-by: Pierre LEMARINIER Co-authored-by: Antoine CAPRA Co-authored-by: Emmanuel BRELLE Co-authored-by: Van Man NGUYEN Co-authored-by: Julien DUPRAT Co-authored-by: Tristan CALS Co-authored-by: Anton DAUMEN Co-authored-by: Alice CARIBONI Co-authored-by: François WELLENREITER Signed-off-by: Van Man NGUYEN --- config/ompi_check_ubcl.m4 | 57 + ompi/mca/common/ubcl/Makefile.am | 94 ++ ompi/mca/common/ubcl/common_ubcl.c | 173 +++ ompi/mca/common/ubcl/common_ubcl.h | 44 + ompi/mca/common/ubcl/configure.m4 | 29 + ompi/mca/osc/ubcl/Makefile.am | 53 + ompi/mca/osc/ubcl/README.md | 379 ++++++ ompi/mca/osc/ubcl/configure.m4 | 32 + 
ompi/mca/osc/ubcl/osc_ubcl.c | 556 +++++++++ ompi/mca/osc/ubcl/osc_ubcl.h | 189 +++ ompi/mca/osc/ubcl/osc_ubcl_accumulate.c | 1105 +++++++++++++++++ ompi/mca/osc/ubcl/osc_ubcl_datatype.c | 86 ++ ompi/mca/osc/ubcl/osc_ubcl_get.c | 167 +++ ompi/mca/osc/ubcl/osc_ubcl_info.c | 117 ++ ompi/mca/osc/ubcl/osc_ubcl_info.h | 24 + ompi/mca/osc/ubcl/osc_ubcl_put.c | 169 +++ ompi/mca/osc/ubcl/osc_ubcl_request.c | 109 ++ ompi/mca/osc/ubcl/osc_ubcl_request.h | 79 ++ ompi/mca/osc/ubcl/osc_ubcl_sync.c | 788 ++++++++++++ ompi/mca/osc/ubcl/osc_ubcl_sync.h | 45 + ompi/mca/osc/ubcl/osc_ubcl_utils.h | 37 + ompi/mca/pml/ubcl/Makefile.am | 52 + ompi/mca/pml/ubcl/configure.m4 | 35 + ompi/mca/pml/ubcl/pml_ubcl.c | 174 +++ ompi/mca/pml/ubcl/pml_ubcl.h | 197 +++ ompi/mca/pml/ubcl/pml_ubcl_component.c | 288 +++++ ompi/mca/pml/ubcl/pml_ubcl_datatype.c | 89 ++ ompi/mca/pml/ubcl/pml_ubcl_endpoint.c | 418 +++++++ ompi/mca/pml/ubcl/pml_ubcl_endpoint.h | 18 + ompi/mca/pml/ubcl/pml_ubcl_iprobe.c | 129 ++ ompi/mca/pml/ubcl/pml_ubcl_irecv.c | 292 +++++ ompi/mca/pml/ubcl/pml_ubcl_isend.c | 249 ++++ ompi/mca/pml/ubcl/pml_ubcl_progress.c | 38 + ompi/mca/pml/ubcl/pml_ubcl_request.c | 386 ++++++ ompi/mca/pml/ubcl/pml_ubcl_request.h | 311 +++++ ompi/mca/pml/ubcl/pml_ubcl_utils.c | 43 + ompi/mca/pml/ubcl/pml_ubcl_utils.h | 39 + ompi/mca/pml/ubcl/post_configure.sh | 2 + opal/mca/common/ubcl/Makefile.am | 105 ++ opal/mca/common/ubcl/common_ubcl.c | 445 +++++++ opal/mca/common/ubcl/common_ubcl.h | 72 ++ opal/mca/common/ubcl/configure.m4 | 27 + opal/mca/common/ubcl/help-mpi-common-ubcl.txt | 28 + 43 files changed, 7769 insertions(+) create mode 100644 config/ompi_check_ubcl.m4 create mode 100644 ompi/mca/common/ubcl/Makefile.am create mode 100644 ompi/mca/common/ubcl/common_ubcl.c create mode 100644 ompi/mca/common/ubcl/common_ubcl.h create mode 100644 ompi/mca/common/ubcl/configure.m4 create mode 100644 ompi/mca/osc/ubcl/Makefile.am create mode 100644 ompi/mca/osc/ubcl/README.md create mode 100644 ompi/mca/osc/ubcl/configure.m4 create mode 100644 ompi/mca/osc/ubcl/osc_ubcl.c create mode 100644 ompi/mca/osc/ubcl/osc_ubcl.h create mode 100644 ompi/mca/osc/ubcl/osc_ubcl_accumulate.c create mode 100644 ompi/mca/osc/ubcl/osc_ubcl_datatype.c create mode 100644 ompi/mca/osc/ubcl/osc_ubcl_get.c create mode 100644 ompi/mca/osc/ubcl/osc_ubcl_info.c create mode 100644 ompi/mca/osc/ubcl/osc_ubcl_info.h create mode 100644 ompi/mca/osc/ubcl/osc_ubcl_put.c create mode 100644 ompi/mca/osc/ubcl/osc_ubcl_request.c create mode 100644 ompi/mca/osc/ubcl/osc_ubcl_request.h create mode 100644 ompi/mca/osc/ubcl/osc_ubcl_sync.c create mode 100644 ompi/mca/osc/ubcl/osc_ubcl_sync.h create mode 100644 ompi/mca/osc/ubcl/osc_ubcl_utils.h create mode 100644 ompi/mca/pml/ubcl/Makefile.am create mode 100644 ompi/mca/pml/ubcl/configure.m4 create mode 100644 ompi/mca/pml/ubcl/pml_ubcl.c create mode 100644 ompi/mca/pml/ubcl/pml_ubcl.h create mode 100644 ompi/mca/pml/ubcl/pml_ubcl_component.c create mode 100644 ompi/mca/pml/ubcl/pml_ubcl_datatype.c create mode 100644 ompi/mca/pml/ubcl/pml_ubcl_endpoint.c create mode 100644 ompi/mca/pml/ubcl/pml_ubcl_endpoint.h create mode 100644 ompi/mca/pml/ubcl/pml_ubcl_iprobe.c create mode 100644 ompi/mca/pml/ubcl/pml_ubcl_irecv.c create mode 100644 ompi/mca/pml/ubcl/pml_ubcl_isend.c create mode 100644 ompi/mca/pml/ubcl/pml_ubcl_progress.c create mode 100644 ompi/mca/pml/ubcl/pml_ubcl_request.c create mode 100644 ompi/mca/pml/ubcl/pml_ubcl_request.h create mode 100644 ompi/mca/pml/ubcl/pml_ubcl_utils.c create mode 100644 
ompi/mca/pml/ubcl/pml_ubcl_utils.h create mode 100644 ompi/mca/pml/ubcl/post_configure.sh create mode 100644 opal/mca/common/ubcl/Makefile.am create mode 100644 opal/mca/common/ubcl/common_ubcl.c create mode 100644 opal/mca/common/ubcl/common_ubcl.h create mode 100644 opal/mca/common/ubcl/configure.m4 create mode 100644 opal/mca/common/ubcl/help-mpi-common-ubcl.txt diff --git a/config/ompi_check_ubcl.m4 b/config/ompi_check_ubcl.m4 new file mode 100644 index 00000000000..d44a6e4b6cc --- /dev/null +++ b/config/ompi_check_ubcl.m4 @@ -0,0 +1,57 @@ +# -*- shell-script -*- +# +# Copyright (C) 2015-2017 Mellanox Technologies, Inc. +# All rights reserved. +# Copyright (c) 2015 Research Organization for Information Science +# and Technology (RIST). All rights reserved. +# Copyright (c) 2016 Los Alamos National Security, LLC. All rights +# reserved. +# Copyright (c) 2016 Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2022 Amazon.com, Inc. or its affiliates. All Rights reserved. +# Copyright (c) 2024-2025 Bull S.A.S. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# OMPI_CHECK_UBCL(prefix, [action-if-found], [action-if-not-found]) +# -------------------------------------------------------- +# check if UBCL support can be found. sets prefix_{CPPFLAGS, +# as needed and runs action-if-found if there is +# support, otherwise executes action-if-not-found +AC_DEFUN([OMPI_CHECK_UBCL],[ + OPAL_VAR_SCOPE_PUSH([ompi_check_ubcl_dir ompi_check_ubcl_happy]) + + m4_ifblank([$1], [m4_fatal([First argument to OMPI_CHECK_UBCL cannot be blank])]) + + AC_ARG_WITH([ubcl], + [AC_HELP_STRING([--with-ubcl(=DIR)], + [Build with UBCL support])]) + + # UBCL is dlopen'd to avoid direct link to libubcl.so. + # OAC_CHECK_PACKAGE would add this explicit link, so it cannot be used. + # OPAL_CHECK_WITHDIR prints an error if the given path is invalid + OPAL_CHECK_WITHDIR([ubcl], [$with_ubcl], [include/ubcl_api.h]) + + AS_IF([test "$with_ubcl" == "no"], + [ompi_check_ubcl_happy="no"], + + [test -z "$with_ubcl"], + [ompi_check_ubcl_happy="no"], + + [ompi_check_ubcl_happy="yes" + $1_CPPFLAGS="${$1_CPPFLAGS} -I$with_ubcl/include/" + AC_MSG_NOTICE([$1_CPPFLAGS is set to: ${$1_CPPFLAGS}])]) + + + OPAL_SUMMARY_ADD([Transports],[UBCL],[],[$ompi_check_ubcl_happy]) + + AS_IF([test "$ompi_check_ubcl_happy" = "yes"], + [$2], + [$3]) + + OPAL_VAR_SCOPE_POP +]) + diff --git a/ompi/mca/common/ubcl/Makefile.am b/ompi/mca/common/ubcl/Makefile.am new file mode 100644 index 00000000000..0cd4eb083ef --- /dev/null +++ b/ompi/mca/common/ubcl/Makefile.am @@ -0,0 +1,94 @@ +# Copyright (c) 2025 Bull SAS. All rights reserved. 
+# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +#AM_CPPFLAGS = $(common_ubcl_CPPFLAGS) + +common_ubcl_sources = \ + common_ubcl.c \ + common_ubcl.h + +lib_LTLIBRARIES = +noinst_LTLIBRARIES = + +#Common component naming is forced by MCA_PROCESS_COMPONENT in config/opal_mca.m4 +# to lib${_LIB_NAME}mca_common_ubcl.la but OMPI_LIB_NAME does not exist +# so let's hope that no other project name is empty or there are no other common +comp_inst = libmca_common_ubcl.la +comp_noinst = libmca_common_ubcl_noinst.la + +if MCA_BUILD_ompi_common_ubcl_DSO +lib_LTLIBRARIES += $(comp_inst) +else +noinst_LTLIBRARIES += $(comp_noinst) +endif + +libmca_common_ubcl_la_SOURCES = $(common_ubcl_sources) +libmca_common_ubcl_la_CFLAGS = $(common_ubcl_CFLAGS) +libmca_common_ubcl_la_CPPFLAGS = $(common_ubcl_CPPFLAGS) +libmca_common_ubcl_la_LDFLAGS = $(common_ubcl_LDFLAGS) +libmca_common_ubcl_la_LIBADD = $(common_ubcl_LIBS) \ + $(OPAL_TOP_BUILDDIR)/opal/mca/common/ubcl/lib@OPAL_LIB_NAME@mca_common_ubcl.la + +libmca_common_ubcl_noinst_la_SOURCES = $(common_ubcl_sources) + +# Conditionally install the header files + +if WANT_INSTALL_HEADERS +ompidir = $(ompiincludedir)/$(subdir) +ompi_HEADERS = common_ubcl.h +endif + + +# This library is linked against various MCA components. +# There's two cases: +# +# 1. libmca_common_ubcl.la is a shared library. By linking that shared +# library to all components that need it, the OS linker will +# automatically load it into the process as necessary, and there will +# only be one copy (i.e., all the components will share *one* copy of +# the code and data). +# +# 2. libmca_common_ubcl.la is a static library. In this case, it will +# be rolled up into the top-level libmpi.la. It will also be rolled +# into each component, but then the component will also be rolled up +# into the upper-level libmpi.la. Linkers universally know how to +# "figure this out" so that we end up with only one copy of the code +# and data. +# +# As per above, we'll either have an installable or noinst result. +# The installable one should follow the same MCA prefix naming rules +# (i.e., libmca__.la). The noinst one can be named +# whatever it wants, although libmca___noinst.la is +# recommended. + +# To simplify components that link to this library, we will *always* +# have an output libtool library named libmca__.la -- even +# for case 2) described above (i.e., so there's no conditional logic +# necessary in component Makefile.am's that link to this library). +# Hence, if we're creating a noinst version of this library (i.e., +# case 2), we sym link it to the libmca__.la name +# (libtool will do the Right Things under the covers). See the +# all-local and clean-local rules, below, for how this is effected. +# These two rules will sym link the "noinst" libtool library filename +# to the installable libtool library filename in the case where we are +# compiling this component statically (case 2), described above). 
+V=0 +OMPI_V_LN_SCOMP = $(ompi__v_LN_SCOMP_$V) +ompi__v_LN_SCOMP_ = $(ompi__v_LN_SCOMP_$AM_DEFAULT_VERBOSITY) +ompi__v_LN_SCOMP_0 = @echo " LN_S " `basename $(comp_inst)`; + +all-local: + $(OMPI_V_LN_SCOMP) if test -z "$(lib_LTLIBRARIES)"; then \ + rm -f "$(comp_inst)"; \ + $(LN_S) "$(comp_noinst)" "$(comp_inst)"; \ + fi + +clean-local: + if test -z "$(lib_LTLIBRARIES)"; then \ + rm -f "$(comp_inst)"; \ + fi diff --git a/ompi/mca/common/ubcl/common_ubcl.c b/ompi/mca/common/ubcl/common_ubcl.c new file mode 100644 index 00000000000..0e95c01d66a --- /dev/null +++ b/ompi/mca/common/ubcl/common_ubcl.c @@ -0,0 +1,173 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2025 Bull SAS. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include +#include +#include + +#include "ompi/communicator/communicator.h" +#include "ompi/errhandler/errhandler.h" +#include "ompi/include/mpi.h" +#include "ompi/runtime/mpiruntime.h" +#include "ompi/mca/common/ubcl/common_ubcl.h" +#include "ompi/mca/pml/ubcl/pml_ubcl.h" +#include "ompi/mca/pml/pml_constants.h" +#include "opal/mca/common/ubcl/common_ubcl.h" +#include "opal/util/output.h" + +/* Default ompi_common_ubcl values */ +mca_ompi_common_ubcl_component_t mca_ompi_common_ubcl_component = { + .n_addr = 32, +}; + +static int mca_common_ubcl_find_rank(const struct ompi_communicator_t *comm, const uint64_t wrank) +{ + mca_pml_ubcl_comm_t *pml_comm = comm->c_pml_comm; + + if (NULL == comm->c_pml_comm) { + common_ubcl_error("UBCL error: no translation array in comm"); + abort(); + } + + for (uint32_t i = 0; i < pml_comm->size; i++) { + if (pml_comm->array[i] == wrank) { + return i; + } + } + + common_ubcl_error("UBCL error irank translation"); + + return 0; +} + +int mca_common_ubcl_get_mpi_rank(const int rank, const struct ompi_communicator_t *comm, + const uint64_t ubcl_rank) +{ + if (OMPI_ANY_SOURCE == rank) { + return mca_common_ubcl_find_rank(comm, ubcl_rank); + } else { + return rank; + } +} + +void mca_common_ubcl_status_to_ompi(ompi_status_public_t *status, + ubcl_status_t ubcl_status, + struct ompi_communicator_t *comm, int rank) +{ + if (MPI_STATUS_IGNORE != status) { + status->_cancelled = 0; //TODO output the information of cancel + status->_ucount = ubcl_status.size; + status->MPI_TAG = (int) ubcl_status.tag; + status->MPI_SOURCE = mca_common_ubcl_get_mpi_rank(rank, comm, ubcl_status.remote); + } +} + +int ubcl_error_to_ompi(ubcl_error_t code) +{ + int ret; + switch (code) { + case UBCL_SUCCESS: + ret = OPAL_SUCCESS; + break; + case UBCL_ERROR: + ret = OPAL_ERROR; + break; + case UBCL_ERR_RESOURCE_BUSY: + ret = OPAL_ERR_RESOURCE_BUSY; + break; + case UBCL_ERR_OUT_OF_RESOURCE: + ret = OPAL_ERR_OUT_OF_RESOURCE; + break; + case UBCL_ERR_NOT_IMPLEMENTED: + ret = OPAL_ERR_NOT_IMPLEMENTED; + break; + case UBCL_ERR_NOT_AVAILABLE: + ret = OPAL_ERR_NOT_AVAILABLE; + break; + case UBCL_ERR_TEMP_OUT_OF_RESOURCE: + ret = OPAL_ERR_TEMP_OUT_OF_RESOURCE; + break; + case UBCL_ERR_ARG_INVALID: + ret = OPAL_ERR_BAD_PARAM; + break; + case UBCL_ERR_TOO_LATE: + ret = OPAL_ERR_TIMEOUT; + break; + case UBCL_ERR_TRUNCATE: + ret = MPI_ERR_TRUNCATE; + break; + default: + ret = OPAL_ERROR; + break; + } + + return ret; +} + +void _mca_common_ubcl_error(char *filename, int line, int err, + char abort, int verbose, int output, + int is_init, int comp_verbose, + char *comp_name, char *format, ...) 
+{ + int n_addr = 0; + void **stack_buffer = NULL; + char **stack = NULL; + + stack_buffer = malloc(sizeof(void *) * mca_ompi_common_ubcl_component.n_addr); + n_addr = backtrace(stack_buffer, mca_ompi_common_ubcl_component.n_addr); + stack = backtrace_symbols(stack_buffer, n_addr); + + int char_per_line = 256; + int n_char = char_per_line * n_addr + 1024; + char *msg = malloc(n_char * sizeof(char)); + + if (NULL == stack || NULL == msg) { + /* Output small error */ + opal_output_verbose(verbose, output, + "========\n== ERROR: Not enough memory while outputting error...\n== " + "%s encountered an error (%d) at %s:%d\n========\n", + comp_name, err, filename, line); + } else { + /* Output full error */ + int current = 0; + current += snprintf(msg + current, n_char - current, + "========\n== %s encountered an error (%d) at %s:%d\n== %s:\n\t", + comp_name, err, filename, line, abort ? "ERROR" : "WARNING"); + va_list arglist; + va_start(arglist, format); + current += vsnprintf(msg + current, n_char - current, format, arglist); + va_end(arglist); + + current += snprintf(msg + current, n_char - current, "\n== STACK:\n"); + + for (int i = 0; i < n_addr; i++) { + size_t min_char = char_per_line < (n_char - current) ? char_per_line : n_char - current; + current += snprintf(msg + current, min_char, "= [%2d] %s\n", i, + stack[i]); + } + + if (is_init && output > 0) { + opal_output_verbose(verbose, output, + "%s========", msg); + } else if (abort || comp_verbose >= verbose) { + fprintf(stderr, "%s\n", msg); + fflush(stderr); + } + } + + if (abort) { + OMPI_ERRHANDLER_INVOKE(&ompi_mpi_comm_world.comm, err, stack[0]); + ompi_mpi_abort(&ompi_mpi_comm_world.comm, err); + } + + free(stack_buffer); + free(stack); + free(msg); +} diff --git a/ompi/mca/common/ubcl/common_ubcl.h b/ompi/mca/common/ubcl/common_ubcl.h new file mode 100644 index 00000000000..d23ccf3152d --- /dev/null +++ b/ompi/mca/common/ubcl/common_ubcl.h @@ -0,0 +1,44 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2025 Bull SAS. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef OMPI_MCA_COMMON_UBCL_H +#define OMPI_MCA_COMMON_UBCL_H + +#include + +#include "ompi/communicator/communicator.h" +#include "ompi/include/mpi.h" +#include "opal/mca/common/ubcl/common_ubcl.h" + +/* Holds common variable used in multiple UBCL components */ +struct mca_ompi_common_ubcl_component_s { + int n_addr; /**< Max number of void * addresses in printed stack*/ +}; +typedef struct mca_ompi_common_ubcl_component_s mca_ompi_common_ubcl_component_t; +extern mca_ompi_common_ubcl_component_t mca_ompi_common_ubcl_component; + +int mca_common_ubcl_get_mpi_rank(const int rank, const struct ompi_communicator_t *comm, + const uint64_t ubcl_rank); +void mca_common_ubcl_status_to_ompi(ompi_status_public_t *status, + ubcl_status_t ubcl_status, + struct ompi_communicator_t *comm, int rank); +int ubcl_error_to_ompi(ubcl_error_t code); +/* UBCL rank is on 61 bits, ompi jobid is 32bits, vpid must be truncated to 29bits */ +#define COMMON_UBCL_VPID_MAX (((1 << 29) - 1)) /* We need 3 bits for UBCL rank */ +#define PML_UBCL_JOBID_MAX (OPAL_JOBID_MAX) + +/* Error and warning output function used by UBCL components */ +void _mca_common_ubcl_error(char *filename, int line, int err, char abort, int verbose, + int output, int is_init, int comp_verbose, char *comp_name, + char *format, ...); + + +#endif /* OMPI_MCA_COMMON_UBCL_H */ + diff --git a/ompi/mca/common/ubcl/configure.m4 b/ompi/mca/common/ubcl/configure.m4 new file mode 100644 index 00000000000..42ba29cf67a --- /dev/null +++ b/ompi/mca/common/ubcl/configure.m4 @@ -0,0 +1,29 @@ +# -*- shell-script -*- +# +# Copyright (c) 2025 Bull S.A.S. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +AC_DEFUN([MCA_ompi_common_ubcl_CONFIG],[ + AC_CONFIG_FILES([ompi/mca/common/ubcl/Makefile]) + + OMPI_CHECK_UBCL([ompi_common_ubcl], + [common_ubcl_happy="yes"], + [common_ubcl_happy="no"]) + + + AC_REQUIRE([MCA_opal_common_ubcl_CONFIG]) + + AS_IF([test "$common_ubcl_happy" = "yes"], + [$1], + [$2]) + + # substitute in the things needed to build ubcl + AC_SUBST([common_ubcl_CPPFLAGS]) + AC_SUBST([common_ubcl_LDFLAGS]) + AC_SUBST([common_ubcl_LIBS]) +])dnl diff --git a/ompi/mca/osc/ubcl/Makefile.am b/ompi/mca/osc/ubcl/Makefile.am new file mode 100644 index 00000000000..90a8b67d4f5 --- /dev/null +++ b/ompi/mca/osc/ubcl/Makefile.am @@ -0,0 +1,53 @@ +# Copyright (c) 2025 Bull SAS. All rights reserved. 
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+
+AM_CPPFLAGS = $(osc_ubcl_CPPFLAGS)
+
+dist_ompidata_DATA =
+
+# EXTRA_DIST = post_configure.sh
+
+ubcl_sources = \
+	osc_ubcl_utils.h \
+	osc_ubcl.h \
+	osc_ubcl.c \
+	osc_ubcl_put.c \
+	osc_ubcl_accumulate.c \
+	osc_ubcl_datatype.c \
+	osc_ubcl_info.c \
+	osc_ubcl_info.h \
+	osc_ubcl_get.c \
+	osc_ubcl_request.c \
+	osc_ubcl_request.h \
+	osc_ubcl_sync.c \
+	osc_ubcl_sync.h
+
+if MCA_BUILD_ompi_osc_ubcl_DSO
+component_noinst =
+component_install = mca_osc_ubcl.la
+else
+component_noinst = libmca_osc_ubcl.la
+component_install =
+endif
+
+mcacomponentdir = $(ompilibdir)
+mcacomponent_LTLIBRARIES = $(component_install)
+mca_osc_ubcl_la_SOURCES = $(ubcl_sources)
+mca_osc_ubcl_la_LIBADD = $(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la \
+	$(osc_ubcl_LIBS) \
+	$(OPAL_TOP_BUILDDIR)/opal/mca/common/ubcl/lib@OPAL_LIB_NAME@mca_common_ubcl.la \
+	$(OMPI_TOP_BUILDDIR)/ompi/mca/common/ubcl/libmca_common_ubcl.la
+
+mca_osc_ubcl_la_LDFLAGS = -module -avoid-version $(osc_ubcl_LDFLAGS)
+mca_osc_ubcl_la_CPPFLAGS = -Wextra -Wall -Werror -Wno-unused-parameter -Wno-missing-field-initializers $(osc_ubcl_CPPFLAGS)
+
+noinst_LTLIBRARIES = $(component_noinst)
+libmca_osc_ubcl_la_SOURCES = $(ubcl_sources)
+libmca_osc_ubcl_la_LIBADD = $(osc_ubcl_LIBS)
+libmca_osc_ubcl_la_LDFLAGS = -module -avoid-version $(osc_ubcl_LDFLAGS)
+libmca_osc_ubcl_la_CPPFLAGS = $(mca_osc_ubcl_la_CPPFLAGS)
diff --git a/ompi/mca/osc/ubcl/README.md b/ompi/mca/osc/ubcl/README.md
new file mode 100644
index 00000000000..d26070e6ecf
--- /dev/null
+++ b/ompi/mca/osc/ubcl/README.md
@@ -0,0 +1,379 @@
+# INTRODUCTION
+
+## How to use:
+Configure using the `--with-ubcl=` option with a correct UBCL install;
+configure needs to find UBCL to build the component.
+You cannot use the OSC/UBCL without the PML/UBCL, as the former relies on the latter
+for UBCL endpoints.
+
+The OSC/UBCL also relies on opal/mca/common/ubcl and ompi/mca/common/ubcl.
+
+
+
+# Architecture:
+
+## Most used data structures
+
+The current UBCL API (for osc calls) resembles that of the MPI specification.
+The idea is to offload as much as possible to UBCL; all this component needs to
+do is translate arguments and make valid calls to UBCL.
+
+
+The component is shared by all windows and, for now, mainly contains fields that
+help with logging. The generic inherited class also brings the data necessary to
+open the component and create windows/modules, such as *osc_init*, *osc_select*
+or *osc_finish*.
+
+
+```c
+struct mca_osc_ubcl_module_s {
+    ompi_osc_base_module_t super;
+    struct ompi_communicator_t *comm;
+    struct ompi_win_t *win;
+    int64_t wid;
+    union {int *all; int uniq;} disp_unit;
+    ubcl_win_flags_t win_flags;
+
+    uint32_t same_disp_unit:1;
+    uint32_t no_locks:1;
+    uint32_t padding_infos:30;
+
+    ubcl_win_sync_type_t sync_type;
+    ubcl_win_sync_type_t *procs_sync_type;
+    int64_t passive_lock_refcount;
+    opal_mutex_t sync_lock;
+
+    unsigned int nb_rank_waited;
+    struct ompi_group_t *active_sync_access_group;
+    struct ompi_group_t *active_sync_exposure_group;
+
+    void *free_after;
+};
+typedef struct mca_osc_ubcl_module_s mca_osc_ubcl_module_t;
+```
+
+The module is specific to one window and holds the fields necessary to that
+window, such as the parent classes.
+The `super` field holds the available one-sided communications, while `win` holds
+the data needed to compute what the window requires at a higher level than the
+osc/ubcl.
+We fill the function pointers of `super` by copying them from a template
+established in the `osc_ubcl.c` file.
+In theory this means we could deactivate or redirect API calls to some other
+one-sided function, but in practice every window has the same calls.
+The communicator field is a duplicate of the communicator the window was created
+with. It provides the necessary information about the group of procs of the
+window, in part to synchronize procs at window create/free using regular MPI
+collectives without introducing deadlocks on the original communicator.
+The `wid` is a unique id identifying the window.
+The `win_flags` are essential on window creation and track for UBCL which
+channels the window can use (*bxi*, *shm* or *self*).
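+
+In essence, the template wiring in `new_module` (shown later in this patch, in
+`osc_ubcl.c`) is a plain struct copy of the function-pointer table:
+
+```c
+mca_osc_ubcl_module_t *module = calloc(1, sizeof(mca_osc_ubcl_module_t));
+/* Copy only the ompi_osc_base_module_t part, i.e. the function pointers */
+memcpy(module, &mca_osc_ubcl_module_template, sizeof(ompi_osc_base_module_t));
+```
+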
+```c
+enum ubcl_win_sync_type {
+    UBCL_WIN_SYNC_NONE,
+    UBCL_WIN_SYNC_LOCK,
+    UBCL_WIN_SYNC_LOCK_NO_CHECK,
+    UBCL_WIN_SYNC_LOCK_ALL,
+    UBCL_WIN_SYNC_LOCK_ALL_NO_CHECK,
+    UBCL_WIN_SYNC_PSCW,
+    UBCL_WIN_SYNC_FENCE,
+    UBCL_WIN_SYNC_FENCE_EPOCH
+};
+typedef enum ubcl_win_sync_type ubcl_win_sync_type_t;
+```
+
+`sync_type` and `procs_sync_type` are enums used to track the type of synchronization
+used on the whole window and for each *access epoch* respectively. They serve a
+debugging purpose, check correct use of the MPI interface, and are also mandatory
+to handle the various behaviors of the synchronization functions, such as the
+`fence` epoch only starting at the first communication.
+`sync_type` is a global window status, whereas `procs_sync_type` proxies the
+information at the scale of each peer rank when needed by the sync type.
+`sync_lock` is a lock used to guarantee that only one thread accesses the
+thread-critical fields (`sync_type`, `procs_sync_type`, `passive_lock_refcount`,
+`nb_rank_waited`).
+
+`nb_rank_waited` is used exclusively by *Test* and *Wait* to track which exposure
+epoch (to which proc) was terminated.
+`active_sync_access_group` and `active_sync_exposure_group` save the groups used
+to create the PSCW epoch(s). They are needed to complete the *Complete*, *Wait*
+and *Test* operations as well as the one-sided communications.
+`free_after` is a pointer to memory attached to the window that needs to be freed
+alongside the window.
+
+
+
+## Window creation
+
+```c
+MPI_Win_create
+      ↓
+ompi_win_create
+      ↓
+ompi_osc_base_select
+  ↓ ↑        |
+osc_ubcl_query -┘
+             ↓
+      component_select
+             ↓
+        new_module
+             ↓
+        win_create
+             ↓
+     ubcl_win_create
+```
+
+Each time the user requests a window to be created, this diagram of function
+calls is followed.
+`osc_ubcl_query` returns the priority depending on the requested window flavor.
+`ompi_osc_base_select` selects the component with the highest priority as the
+osc for the window.
+`component_select` calls the function to create the window/module and enforces
+synchronization with a barrier.
+`new_module` allocates a new module and then copies the module template onto it.
+`win_create` prepares the `win_flags` for UBCL, based on the PML endpoint types
+of the communicator.
+`ubcl_win_create` creates the window inside the UBCL library.
+
+
+### Dynamic windows
+
+Giving the flavor `MPI_WIN_FLAVOR_DYNAMIC` allows OMPI to create a dynamic window.
+We then need to attach one or more buffers to it with *win_attach* and *win_detach*.
+Since the window buffer is handled by UBCL, *win_attach* and *win_detach* do
+very little.
+
+
+## Synchronization
+
+### Generalities
+
+To enable synchronization we need every proc involved to be inside the same
+window and an *epoch* to be opened.
+To that effect we store the type of synchronization inside the `osc_module`,
+that is, the type of the opened epoch (*passive* or *active*, and which one
+precisely), because we can't run a one-sided communication without any
+synchronization.
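+
+As an illustration of how this stored state can be used, here is a hedged
+sketch of such a guard (hypothetical helper, not code from this component):
+
+```c
+/* Hypothetical sketch: refuse an RMA call when no epoch is open on the window */
+static int osc_ubcl_check_epoch(mca_osc_ubcl_module_t *module)
+{
+    int ret = OMPI_SUCCESS;
+    OPAL_THREAD_LOCK(&module->sync_lock);     /* sync_type is thread-critical */
+    if (UBCL_WIN_SYNC_NONE == module->sync_type) {
+        ret = OMPI_ERR_RMA_SYNC;              /* no epoch opened on this window */
+    }
+    OPAL_THREAD_UNLOCK(&module->sync_lock);
+    return ret;
+}
+```
+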
+### Passive sync
+
+In *passive synchronization*, data is moved from the memory of one process to
+the memory of another, and only the origin process is explicitly involved in the
+transfer. Thus, two origin processes may communicate by accessing the same
+location in a target window.
+Although no MPI call is required on the target side, the target process still
+needs to call `ubcl_progress` to actively handle the request establishing the
+*lock*.
+
+#### Lock/Unlock
+
+```c
+int ompi_osc_ubcl_lock(int lock_type, int target, int assert, struct ompi_win_t *win);
+int ompi_osc_ubcl_unlock(int target, struct ompi_win_t *win);
+
+int ompi_osc_ubcl_lock_all(int assert, struct ompi_win_t *win);
+int ompi_osc_ubcl_unlock_all(struct ompi_win_t *win);
+```
+
+The *lock*/*unlock* functions use the sync type `UBCL_WIN_SYNC_LOCK`. Lock is
+allowed only if the window has a sync type `UBCL_WIN_SYNC_LOCK` or
+`UBCL_WIN_SYNC_NONE` and if the target process is not yet locked. It marks the
+window (if not already done) as sync type `UBCL_WIN_SYNC_LOCK`, and changes the
+target process sync type to `UBCL_WIN_SYNC_LOCK` in the local array of locked
+processes.
+It also increases the `passive_lock_refcount`, which tracks the number of *locks*
+taken so that *unlock* can reset the window type when it should.
+*Unlock* requires the window to be in `UBCL_WIN_SYNC_LOCK` and the target process
+to be locked in the local array.
+
+The *lock_all*/*unlock_all* functions use the sync type `UBCL_WIN_SYNC_LOCK_ALL`
+for the window only, because we don't locally mark target processes as locked.
+Otherwise they function the same as a simple *lock*.
+As MPI requires that an initiator cannot lock the same target multiple times,
+*lock_all* and *lock* are mutually exclusive despite their similar names, which
+is why a distinct sync type is needed.
+The main difference is that the UBCL call requires an array argument that we
+have to build.
+
+
+In case we're provided with `MPI_MODE_NOCHECK` as assertion, we don't bother
+actually locking the processes. However, we still mark the target process as
+being locked with `UBCL_WIN_SYNC_LOCK_NO_CHECK` and change the window sync type
+to `UBCL_WIN_SYNC_LOCK` in the case of the simple *lock*.
+For *lock_all* we mark the window as having an epoch `UBCL_WIN_SYNC_LOCK_ALL_NO_CHECK`.
+
+
+#### Flush
+
+```c
+int ompi_osc_ubcl_flush(int target, struct ompi_win_t *win);
+int ompi_osc_ubcl_flush_all(struct ompi_win_t *win);
+int ompi_osc_ubcl_flush_local(int target, struct ompi_win_t *win);
+int ompi_osc_ubcl_flush_local_all(struct ompi_win_t *win);
+```
+
+The *flush* functions don't create any epoch and therefore have no associated
+sync type. However, the *flush* functions can only be called if the current
+process has a valid passive target access epoch on the target process.
+They make sure that all the previous one-sided communications on the window,
+from the initiator to the target, are completed.
+For now *flush_local* is an alias for *flush*, and *flush[_local]_all* loops
+over *flush[_local]*.
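+
+For reference, a typical passive-target sequence on the user side looks like
+this (standard MPI calls; `target`, `disp` and `win` are assumed to be already
+set up):
+
+```c
+int buf = 42;
+MPI_Win_lock(MPI_LOCK_SHARED, target, 0, win);   /* open the access epoch */
+MPI_Put(&buf, 1, MPI_INT, target, disp, 1, MPI_INT, win);
+MPI_Win_flush(target, win);     /* the put is now complete at the target */
+MPI_Win_unlock(target, win);    /* close the access epoch */
+```
+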
+#### Sync
+
+When the data modifications are not fully handled by the NIC, some
+counterproductive caches must be cleaned at the start of one-sided exposure
+epochs.
+In the active target synchronization model, such a call is made on the target
+side in the fence and post functions.
+As there is no MPI call on the target side in the passive target synchronization
+model, this is handled internally by UBCL.
+
+### Active sync
+
+In *active synchronization*, data is moved from the memory of one process to the
+memory of another, and both are explicitly involved in the synchronization. This
+communication pattern is similar to message passing, except that all the data
+transfer arguments are provided by one process, and the second process only
+participates in the synchronization.
+
+#### PSCW
+
+```c
+int ompi_osc_ubcl_post(struct ompi_group_t *group, int assert, struct ompi_win_t *win);
+int ompi_osc_ubcl_start(struct ompi_group_t *group, int assert, struct ompi_win_t *win);
+int ompi_osc_ubcl_complete(struct ompi_win_t *win);
+int ompi_osc_ubcl_wait(struct ompi_win_t *win);
+int ompi_osc_ubcl_test(struct ompi_win_t *win, int *flag);
+```
+
+The *PSCW* functions use the sync type `UBCL_WIN_SYNC_PSCW`. *Post* and *Start*
+require sync type `UBCL_WIN_SYNC_NONE` or `UBCL_WIN_SYNC_PSCW` on the window and
+no *PSCW* synchronization group tracked already. *Complete* and *Wait* require
+sync type `UBCL_WIN_SYNC_PSCW` on the window and a non-NULL synchronization
+group in the window. *Test* is the same as *Wait* but non-blocking.
+
+In UBCL, the functions only take one target at a time, whereas OMPI PSCW
+functions take a whole group. So *Post* and *Start* loop on the group given in
+argument, and *Complete*, *Wait* and *Test* loop on the groups specified - and
+stored inside the window - when the epoch was established.
+Below is the correspondence between OMPI and UBCL:
+- `MPI_Win_post` => `ubcl_win_target_grants_lock`
+- `MPI_Win_start` => `ubcl_win_initiator_waits_lock`
+- `MPI_Win_complete` => `ubcl_win_initiator_releases_lock`
+- `MPI_Win_wait` => `ubcl_win_target_waits_lock_release`
+- `MPI_Win_test` => `ubcl_win_target_tests_lock_release`
+
+We don't make use of the assert argument here for any of the active target
+synchronization functions.
+
+
+#### Fence
+
+```c
+int ompi_osc_ubcl_fence(int assert, struct ompi_win_t *win);
+```
+
+The *fence* function uses the sync types `UBCL_WIN_SYNC_FENCE` and
+`UBCL_WIN_SYNC_FENCE_EPOCH`. This synchronization scheme needs both types for
+correctness checks. No other synchronization call may be started until all
+epochs have completed.
+The first fence sets the window sync type to `UBCL_WIN_SYNC_FENCE`, and the
+first one-sided communication that starts will begin a fence epoch, setting the
+sync type to `UBCL_WIN_SYNC_FENCE_EPOCH`.
+That also means we have to allow other synchronization schemes to start an epoch
+on `UBCL_WIN_SYNC_FENCE` as if it were `UBCL_WIN_SYNC_NONE`.
+The function flushes the one-sided communications started in the current epoch,
+acting as a barrier. Additionally, when called inside a *fence* epoch, it closes
+said epoch and the sync type goes back to `UBCL_WIN_SYNC_FENCE`.
+
+Here we take into account the `MPI_MODE_NOPRECEDE` and `MPI_MODE_NOSUCCEED`
+assertions only. The first one allows us to skip flushing the previously started
+one-sided communications, since there are none. We don't exploit the second
+assertion much, except when both values are given, in which case the *fence*
+does almost nothing.
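+
+On the user side this corresponds to the classic fence-bracketed pattern
+(standard MPI calls; `target`, `disp` and `win` are assumed to be already set
+up):
+
+```c
+int buf = 42;
+MPI_Win_fence(MPI_MODE_NOPRECEDE, win);  /* nothing to flush before this epoch */
+MPI_Put(&buf, 1, MPI_INT, target, disp, 1, MPI_INT, win);  /* starts the fence epoch */
+MPI_Win_fence(MPI_MODE_NOSUCCEED, win);  /* flushes the put and closes the epoch */
+```
+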
+
+
+## One-Sided Communications
+
+### Put
+
+```c
+int ompi_osc_ubcl_put(const void *origin_addr,
+                      int origin_count,
+                      struct ompi_datatype_t *origin_dt,
+                      int target,
+                      ptrdiff_t target_disp,
+                      int target_count,
+                      struct ompi_datatype_t *target_dt,
+                      struct ompi_win_t *win);
+
+int ompi_osc_ubcl_rput(const void *origin_addr,
+                       int origin_count,
+                       struct ompi_datatype_t *origin_dt,
+                       int target,
+                       ptrdiff_t target_disp,
+                       int target_count,
+                       struct ompi_datatype_t *target_dt,
+                       struct ompi_win_t *win,
+                       struct ompi_request_t **ompi_req);
+```
+
+### Get
+
+```c
+int ompi_osc_ubcl_get(void *origin_addr,
+                      int origin_count,
+                      struct ompi_datatype_t *origin_dt,
+                      int target,
+                      ptrdiff_t target_disp,
+                      int target_count,
+                      struct ompi_datatype_t *target_dt,
+                      struct ompi_win_t *win);
+
+int ompi_osc_ubcl_rget(void *origin_addr,
+                       int origin_count,
+                       struct ompi_datatype_t *origin_dt,
+                       int target,
+                       ptrdiff_t target_disp,
+                       int target_count,
+                       struct ompi_datatype_t *target_dt,
+                       struct ompi_win_t *win,
+                       struct ompi_request_t **ompi_req);
+```
+
+### Atomic operations
+
+```c
+int ompi_osc_ubcl_accumulate(const void *origin_addr, int origin_count,
+                             struct ompi_datatype_t *origin_dt, int target, ptrdiff_t target_disp,
+                             int target_count, struct ompi_datatype_t *target_dt,
+                             struct ompi_op_t *op, struct ompi_win_t *win);
+
+int ompi_osc_ubcl_raccumulate(const void *origin_addr, int origin_count,
+                              struct ompi_datatype_t *origin_dt, int target, ptrdiff_t target_disp,
+                              int target_count, struct ompi_datatype_t *target_dt,
+                              struct ompi_op_t *op, struct ompi_win_t *win,
+                              struct ompi_request_t **request);
+
+int ompi_osc_ubcl_get_accumulate(const void *origin_addr, int origin_count,
+                                 struct ompi_datatype_t *origin_datatype, void *result_addr,
+                                 int result_count, struct ompi_datatype_t *result_datatype,
+                                 int target_rank, ptrdiff_t target_disp, int target_count,
+                                 struct ompi_datatype_t *target_datatype, struct ompi_op_t *op,
+                                 struct ompi_win_t *win);
+
+int ompi_osc_ubcl_rget_accumulate(const void *origin_addr, int origin_count,
+                                  struct ompi_datatype_t *origin_datatype, void *result_addr,
+                                  int result_count, struct ompi_datatype_t *result_datatype,
+                                  int target_rank, ptrdiff_t target_disp, int target_count,
+                                  struct ompi_datatype_t *target_datatype, struct ompi_op_t *op,
+                                  struct ompi_win_t *win, struct ompi_request_t **request);
+
+int ompi_osc_ubcl_fetch_and_op(const void *origin_addr, void *result_addr,
+                               struct ompi_datatype_t *dt, int target, ptrdiff_t target_disp,
+                               struct ompi_op_t *op, struct ompi_win_t *win);
+
+int ompi_osc_ubcl_compare_and_swap(const void *origin_addr, const void *compare_addr,
+                                   void *result_addr, struct ompi_datatype_t *dt, int target,
+                                   ptrdiff_t target_disp, struct ompi_win_t *win);
+```
+
+The implementation makes use of the similarity between these functions:
+*accumulate* calls *raccumulate* with *ompi_req = NULL*, and *raccumulate* calls
+*rget_accumulate* with all result arguments set to NULL or 0.
+*get_accumulate* calls *rget_accumulate* with *ompi_req = NULL*.
+*fetch_and_op* also only needs to call *get_accumulate* with the correct
+arguments.
+*compare_and_swap* gets its own implementation.
+
+
+
diff --git a/ompi/mca/osc/ubcl/configure.m4 b/ompi/mca/osc/ubcl/configure.m4
new file mode 100644
index 00000000000..add1db7c94b
--- /dev/null
+++ b/ompi/mca/osc/ubcl/configure.m4
@@ -0,0 +1,32 @@
+# Copyright (c) 2025 Bull SAS. All rights reserved.
+# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + + +AC_DEFUN([MCA_ompi_osc_ubcl_POST_CONFIG], [ + AS_IF([test "$1" = "1"], [OMPI_REQUIRE_ENDPOINT_TAG([PML])]) +]) + +AC_DEFUN([MCA_ompi_osc_ubcl_CONFIG], [ + AC_CONFIG_FILES([ompi/mca/osc/ubcl/Makefile]) + + AC_REQUIRE([MCA_ompi_common_ubcl_CONFIG]) + AC_REQUIRE([MCA_opal_common_ubcl_CONFIG]) + + OMPI_CHECK_UBCL([osc_ubcl], + [osc_ubcl_happy="yes"], + [osc_ubcl_happy="no"]) + + AS_IF([test "$osc_ubcl_happy" = "yes"], + [$1], + [$2]) + +# substitute in the things needed to build ubcl + AC_SUBST([osc_ubcl_CPPFLAGS]) + AC_SUBST([osc_ubcl_LDFLAGS]) + AC_SUBST([osc_ubcl_LIBS]) +]) diff --git a/ompi/mca/osc/ubcl/osc_ubcl.c b/ompi/mca/osc/ubcl/osc_ubcl.c new file mode 100644 index 00000000000..5e81ed1add3 --- /dev/null +++ b/ompi/mca/osc/ubcl/osc_ubcl.c @@ -0,0 +1,556 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2025 Bull SAS. All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +/** + * @file + * + * Bull eXtreme Interconnect OSC API implementation. + * + * Implementation of API defined in osc.h. To see parameters and return values + * of these functions, refer to ompi/mca/osc/osc.h. + */ + +#include "opal/include/opal_config.h" + +#include "ompi/mca/osc/ubcl/osc_ubcl_info.h" +#include "opal/mca/common/ubcl/common_ubcl.h" + +#include "ompi/mca/osc/ubcl/osc_ubcl.h" +#include "ompi/mca/osc/ubcl/osc_ubcl_request.h" +#include "ompi/mca/osc/ubcl/osc_ubcl_sync.h" +#include "ompi/mca/osc/ubcl/osc_ubcl_utils.h" +#include "ompi/mca/osc/ubcl/osc_ubcl_info.h" +#include "opal/util/proc.h" +#include "opal/mca/common/ubcl/common_ubcl.h" +#include "ompi/mca/osc/ubcl/osc_ubcl.h" +#include "ompi/mca/common/ubcl/common_ubcl.h" + +#include + +static int component_open(void); +static int component_register(void); +static int component_init(bool enable_progress_threads, bool enable_mpi_threads); +static int component_fini(void); +static int component_query(struct ompi_win_t *win, void **base, size_t size, ptrdiff_t disp_unit, + struct ompi_communicator_t *comm, struct opal_info_t *info, int flavor); +static int component_select(struct ompi_win_t *win, void **base, size_t size, ptrdiff_t disp_unit, + struct ompi_communicator_t *comm, struct opal_info_t *info, int flavor, + int *model); +static int win_free(struct ompi_win_t *win); +static int shared_query(struct ompi_win_t *win, int rank, size_t *size, ptrdiff_t *disp_unit, + void *baseptr); +static int win_attach(struct ompi_win_t *win, void *base, size_t size); +static int win_detach(struct ompi_win_t *win, const void *base); + +mca_osc_ubcl_component_t mca_osc_ubcl_component = { + .super = { /* ompi_osc_base_component_t */ + .osc_version = { + OMPI_OSC_BASE_VERSION_4_0_0, + .mca_component_name = "ubcl", + MCA_BASE_MAKE_VERSION(component, + OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION, + OMPI_RELEASE_VERSION), + .mca_open_component = component_open, + .mca_register_component_params = component_register, + }, + .osc_data = { + /* The component is not checkpoint ready */ + MCA_BASE_METADATA_PARAM_NONE + }, + .osc_init = component_init, + .osc_query = component_query, + .osc_select = component_select, + .osc_finalize = component_fini, + }, + .is_init = 0 +}; + +mca_osc_ubcl_module_t mca_osc_ubcl_module_template = { + {shared_query, /* Since MPI 4.1, osc should not abort on unsupported shared_query */ + win_attach, + win_detach, + win_free, + + ompi_osc_ubcl_put, + ompi_osc_ubcl_get, + 
ompi_osc_ubcl_accumulate, + ompi_osc_ubcl_compare_and_swap, + ompi_osc_ubcl_fetch_and_op, + ompi_osc_ubcl_get_accumulate, + + ompi_osc_ubcl_rput, + ompi_osc_ubcl_rget, + ompi_osc_ubcl_raccumulate, + ompi_osc_ubcl_rget_accumulate, + + ompi_osc_ubcl_fence, + + ompi_osc_ubcl_start, + ompi_osc_ubcl_complete, + ompi_osc_ubcl_post, + ompi_osc_ubcl_wait, + ompi_osc_ubcl_test, + + ompi_osc_ubcl_lock, + ompi_osc_ubcl_unlock, + ompi_osc_ubcl_lock_all, + ompi_osc_ubcl_unlock_all, + + ompi_osc_ubcl_sync, + ompi_osc_ubcl_flush, + ompi_osc_ubcl_flush_all, + ompi_osc_ubcl_flush_local, + ompi_osc_ubcl_flush_local_all} +}; + +static int component_open(void) +{ + /* Open output stream */ + if (0 < mca_osc_ubcl_component.verbose) { + mca_osc_ubcl_component.output = opal_output_open(NULL); + int verbose = mca_osc_ubcl_component.verbose > 0 ? mca_osc_ubcl_component.verbose : 1; + opal_output_set_verbosity(mca_osc_ubcl_component.output, verbose); + } else { + mca_osc_ubcl_component.output = -1; + } + + return OMPI_SUCCESS; +} + +static int component_register(void) +{ + mca_base_component_t *component = &mca_osc_ubcl_component.super.osc_version; + + mca_osc_ubcl_component.priority = 0; + (void) mca_base_component_var_register(&mca_osc_ubcl_component.super.osc_version, "priority", + "Priority of the ubcl osc component", + MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, 0, + OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_READONLY, + &mca_osc_ubcl_component.priority); + + mca_osc_ubcl_component.verbose = 0; + (void) mca_base_component_var_register(component, "verbose", "Verbosity level of the osc/ubcl.", + MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, + &mca_osc_ubcl_component.verbose); + + mca_osc_ubcl_component.max_req = 0; + (void) + mca_base_component_var_register(component, "max_requests", + "Maximum number of requests allocated. 
(0 means infinite)", + MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, 0, OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &mca_osc_ubcl_component.max_req); + + mca_osc_ubcl_component.min_req = 0; + (void) mca_base_component_var_register(component, "min_requests", + "Minimum (and initial) number of requests allocated.", + MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, + &mca_osc_ubcl_component.min_req); + + mca_osc_ubcl_component.incr_req = 1024; + (void) mca_base_component_var_register( + component, "incr_requests", + "Count of new requests allocated when free list runs out of requests.", + MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, + &mca_osc_ubcl_component.incr_req); + + mca_common_ubcl_register_mca(); + + return OMPI_SUCCESS; +} + +static int component_init(bool enable_progress_threads, bool enable_mpi_threads) +{ + int err; + OPAL_OUTPUT_VERBOSE((50, mca_osc_ubcl_component.output, "UBCL_COMPONENT_INIT\n")); + + if (opal_atomic_fetch_add_64(&mca_osc_ubcl_component.is_init, 1)) { + return OMPI_SUCCESS; + } + + if (OPAL_SUCCESS != mca_common_ubcl_init()) { + mca_osc_ubcl_warn(OMPI_ERR_NOT_AVAILABLE, "common_ubcl could not load UBCL library\n"); + return OMPI_SUCCESS; + } + + OBJ_CONSTRUCT(&mca_osc_ubcl_component.req_free_list, opal_free_list_t); + err = opal_free_list_init(&mca_osc_ubcl_component.req_free_list, sizeof(mca_osc_ubcl_request_t), + opal_cache_line_size, OBJ_CLASS(mca_osc_ubcl_request_t), 0, + opal_cache_line_size, mca_osc_ubcl_component.min_req, + mca_osc_ubcl_component.max_req, mca_osc_ubcl_component.incr_req, NULL, + 0, NULL, NULL, NULL); + + if (OPAL_UNLIKELY(OMPI_SUCCESS != err)) { + mca_osc_ubcl_warn(OMPI_ERR_OUT_OF_RESOURCE, "Not enough memory (%d)", err); + goto error_free_list; + } + + /* Initialize UBCL */ + if (UBCL_SUCCESS != ubcl_init(enable_mpi_threads || enable_progress_threads)) { + goto error_ubcl_init; + } + + /* Mark as initialized and return */ + OPAL_OUTPUT_VERBOSE((50, mca_osc_ubcl_component.output, "INITIATION DONE\n")); + return OMPI_SUCCESS; + +error_ubcl_init: + OBJ_DESTRUCT(&mca_osc_ubcl_component.req_free_list); +error_free_list: + mca_common_ubcl_fini(); + return OMPI_ERROR; +} + +static int component_fini(void) +{ + int ret; + OPAL_OUTPUT_VERBOSE((50, mca_osc_ubcl_component.output, "ubcl_COMPONENT_FINALIZE")); + + if (0 != opal_atomic_sub_fetch_64(&mca_osc_ubcl_component.is_init, 1)) { + return OMPI_SUCCESS; + } + + /* Finalize UBCL */ + ret = ubcl_error_to_ompi(ubcl_fini()); + if (OMPI_SUCCESS != ret) { + return ret; + } + + OBJ_DESTRUCT(&mca_osc_ubcl_component.req_free_list); + + mca_common_ubcl_fini(); + return OMPI_SUCCESS; +} + +static int component_query(struct ompi_win_t *win, void **base, size_t size, ptrdiff_t disp_unit, + struct ompi_communicator_t *comm, struct opal_info_t *info, int flavor) +{ + uint64_t flags = 0; + int dev_id; + + if (MPI_WIN_FLAVOR_SHARED == flavor) { + return OPAL_ERR_NOT_IMPLEMENTED; + } + + if (0 == mca_common_ubcl_is_init()) { + return OPAL_ERR_NOT_INITIALIZED; + } + + /* Accelerator buffer is not supported as provided window buffer */ + if (MPI_WIN_FLAVOR_ALLOCATE != flavor && MPI_WIN_FLAVOR_DYNAMIC != flavor + && 0 < size && NULL != base && NULL != *base + && opal_accelerator.check_addr(*base, &dev_id, &flags) > 0) { + mca_osc_ubcl_log(20, "GPU buffer not supported by osc/ubcl"); + return OPAL_ERR_NOT_SUPPORTED; + } + + return mca_osc_ubcl_component.priority; +} + +static int win_create(void *base, size_t size, 
mca_osc_ubcl_module_t *module) +{ + ompi_proc_t *proc; + mca_common_ubcl_endpoint_t *endpoint; + ompi_group_t * win_group; + int ret = OMPI_SUCCESS; + + module->win_flags.bxi = 0; + module->win_flags.shm = 0; + module->win_flags.self = 0; + if (MPI_WIN_FLAVOR_DYNAMIC == module->win->w_flavor) { + module->win_flags.dynamic = 1; + } + + ompi_win_group(module->win, &win_group); + for (int i = 0; i < ompi_group_size(win_group); i++) { + proc = ompi_group_peer_lookup_existing(win_group, i); + if (OPAL_UNLIKELY(NULL == proc)) { + ret = OMPI_ERR_BAD_PARAM; + mca_osc_ubcl_warn(ret, "Cannot create window: %d-th proc is undefined", i); + goto exit; + } + + endpoint = (mca_common_ubcl_endpoint_t *) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PML]; + if (NULL == endpoint) { + ret = OMPI_ERR_BAD_PARAM; + mca_osc_ubcl_warn(ret, "Cannot create window: %d-th UBCL endpoint is undefined", i); + goto exit; + } + + switch (endpoint->type) { + case UBCL_ENDPOINT_TYPE_SELF: + module->win_flags.self = 1; + break; + case UBCL_ENDPOINT_TYPE_SHMEM: + module->win_flags.shm = 1; + break; + case UBCL_ENDPOINT_TYPE_BXI: + module->win_flags.bxi = 1; + break; + default: + /* Should never happen, UBCL endpoints always have a type */ + mca_osc_ubcl_error(OMPI_ERROR, "Unknown endpoint type"); + } + } + + /* Endpoints are created by the osc/ubcl when ompi_init is called */ + ret = ubcl_error_to_ompi(ubcl_win_create(base, size, module->wid, module->win_flags)); +exit: + return ret; +} + +/* create a module structure */ +static int new_module(struct ompi_win_t *win, void **base, size_t size, + struct ompi_communicator_t *comm, int flavor, mca_osc_ubcl_module_t **pmodule) +{ + int ret = OMPI_ERROR; + void *win_ptr; + mca_osc_ubcl_module_t *module; + + /* Calloc is required to set all pointers to NULL and free them in case + * of error */ + module = (mca_osc_ubcl_module_t *) calloc(1, sizeof(mca_osc_ubcl_module_t)); + if (NULL == module) { + return OMPI_ERR_TEMP_OUT_OF_RESOURCE; + } + memcpy(module, &mca_osc_ubcl_module_template, sizeof(ompi_osc_base_module_t)); + + /* Allocate window buffer */ + if (MPI_WIN_FLAVOR_ALLOCATE == flavor) { + module->free_after = *base = malloc(size); + if (NULL == *base) { + ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE; + goto error; + } + } else { + module->free_after = NULL; + } + + ret = ompi_comm_dup(comm, &module->comm); + if (OMPI_SUCCESS != ret) { + goto error; + } + + /* Putting the cid into the wid that way it should be unique */ + module->win = win; + module->wid = ompi_comm_get_local_cid(module->comm); + module->sync_type = UBCL_WIN_SYNC_NONE; + module->passive_lock_refcount = 0; + OBJ_CONSTRUCT(&module->sync_lock, opal_mutex_t); + module->nb_rank_waited = 0; + module->active_sync_access_group = NULL; + module->active_sync_exposure_group = NULL; + *pmodule = module; + + size_t comm_size = ompi_comm_size(comm); + module->procs_sync_type = malloc(sizeof(ubcl_win_sync_type_t) * comm_size); + if (NULL == module->procs_sync_type) { + ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE; + goto error; + } + + for (size_t i = 0; i < comm_size; i++) { + module->procs_sync_type[i] = UBCL_WIN_SYNC_NONE; + } + + if (MPI_WIN_FLAVOR_DYNAMIC == flavor) { + /* For dynamic windows, base is MPI_BOTTOM, which is NULL, so it can't be dereferenced */ + win_ptr = (void *) base; + } else { + win_ptr = *base; + } + + return win_create(win_ptr, size, module); + +error: + /* According to MPI specifications 12.6.1, errors on window creations are fatal. + * That is why MPI API calls kill all ranks if the return value is not OMPI_SUCCESS. 
+ * Therefore it is not an issue to leave this function without entering + * ompi_comm_dup collective call: other ranks will just be blocked in it + * before being sigkill'd. + */ + /* ompi_comm_free cannot be called here since it is a collective call. */ + free(module->procs_sync_type); + free(module->free_after); + free(module); + return ret; +} + +/* osc ubcl has been selected to exclusively handle the MPI RMA window, + * this is last call before real communications */ +static int component_select(struct ompi_win_t *win, void **base, size_t size, ptrdiff_t disp_unit, + struct ompi_communicator_t *comm, struct opal_info_t *info, int flavor, + int *model) +{ + mca_osc_ubcl_module_t *module = NULL; + int ret; + unsigned name_len = 1024; + char name[name_len]; + + /* Handle erroneous cases */ + if (MPI_WIN_FLAVOR_SHARED == flavor) { + return OPAL_ERR_NOT_IMPLEMENTED; + } + + /* Allocate first a module */ + ret = new_module(win, base, size, comm, flavor, &module); + if (OMPI_SUCCESS != ret) { + return ret; + } + + snprintf(name, name_len, "ubcl window %d, built on %s", ompi_comm_get_local_cid(module->comm), + comm->c_name); + ompi_win_set_name(win, name); + mca_osc_ubcl_log(20, "%s created", win->w_name); + + win->w_osc_module = &module->super; + module->win = win; + *model = MPI_WIN_UNIFIED; + + osc_ubcl_read_info(info, win); + osc_ubcl_sync_disp_unit(module, disp_unit, true); + + mca_osc_ubcl_log(20, "Module allocated at %p", (void *) module); + + return OMPI_SUCCESS; +} + +static int win_free(struct ompi_win_t *win) +{ + mca_osc_ubcl_module_t *module = (mca_osc_ubcl_module_t *) win->w_osc_module; + uint64_t wid; + int ret; + + if (UBCL_WIN_SYNC_NONE != module->sync_type && + UBCL_WIN_SYNC_FENCE != module->sync_type) { + ret = OMPI_ERR_RMA_SYNC; + mca_osc_ubcl_warn(ret, "Cannot free window %d: epoch not ended", module->wid); + return ret; + } + + module->comm->c_coll->coll_barrier(module->comm, + module->comm->c_coll->coll_barrier_module); + + wid = module->wid; + ret = ubcl_error_to_ompi(ubcl_win_free(wid)); + + OBJ_DESTRUCT(&module->sync_lock); + ompi_comm_free(&module->comm); + osc_ubcl_fini_disp_unit(module); + free(module->free_after); + free(module->procs_sync_type); + free(module); + + return ret; +} + +static int shared_query(struct ompi_win_t *win, int rank, size_t *size, ptrdiff_t *disp_unit, + void *baseptr) +{ + (void) win; + (void) rank; + *size = 0; + *disp_unit = 0; + *(void **) baseptr = NULL; + + return OMPI_SUCCESS; +} + +static int win_attach(struct ompi_win_t *win, void *base, size_t size) +{ + ubcl_error_t ret; + ubcl_wid_t wid; + mca_osc_ubcl_module_t *module; + uint64_t flags = 0; + int dev_id; + + module = (mca_osc_ubcl_module_t *) win->w_osc_module; + wid = (ubcl_wid_t) module->wid; + + /* Accelerator buffer is not supported as attached buffer */ + if (opal_accelerator.check_addr(base, &dev_id, &flags)) { + mca_osc_ubcl_warn(OPAL_ERR_NOT_SUPPORTED, "GPU buffer not supported by osc/ubcl"); + return OPAL_ERR_NOT_SUPPORTED; + } + + ret = ubcl_win_attach(base, size, wid); + + return ubcl_error_to_ompi(ret); +} + +static int win_detach(struct ompi_win_t *win, const void *base) +{ + ubcl_error_t ret; + ubcl_wid_t wid; + mca_osc_ubcl_module_t *module; + + module = (mca_osc_ubcl_module_t *) win->w_osc_module; + wid = (ubcl_wid_t) module->wid; + + /* FIXME: get the window size */ + ret = ubcl_win_detach((void *) base, 0, wid); + + return ubcl_error_to_ompi(ret); +} + +int osc_ubcl_build_ddt_iov(const void *addr, ompi_proc_t *proc, size_t count, + ompi_datatype_t *datatype, 
struct iovec **output_iov,
+                           size_t *output_iov_count)
+{
+    opal_convertor_t convertor;
+    int ret;
+    bool done;
+    size_t output_iov_pos;
+
+    OBJ_CONSTRUCT(&convertor, opal_convertor_t);
+    ret = opal_convertor_copy_and_prepare_for_send(proc->super.proc_convertor, &datatype->super,
+                                                   count, addr, 0, &convertor);
+    if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
+        OBJ_DESTRUCT(&convertor);
+        return ret;
+    }
+
+    output_iov_pos = 0;
+    *output_iov_count = 0;
+    done = false;
+    do {
+        size_t length;
+        uint32_t tmp_iov_count;
+        size_t tmp_iov_pos;
+        struct iovec tmp_iov[OSC_UBCL_IOVEC_MAX];
+        struct iovec *new_iov;
+
+        tmp_iov_count = OSC_UBCL_IOVEC_MAX;
+
+        done = opal_convertor_raw(&convertor, tmp_iov, &tmp_iov_count, &length);
+
+        *output_iov_count += tmp_iov_count;
+        /* Grow through a temporary pointer so the previous allocation is not
+         * leaked if realloc fails */
+        new_iov = (struct iovec *) realloc(*output_iov,
+                                           *output_iov_count * sizeof(struct iovec));
+        if (NULL == new_iov) {
+            free(*output_iov);
+            *output_iov = NULL;
+            OBJ_DESTRUCT(&convertor);
+            return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
+        }
+        *output_iov = new_iov;
+
+        tmp_iov_pos = 0;
+        while (tmp_iov_pos != tmp_iov_count) {
+            (*output_iov)[output_iov_pos].iov_base = tmp_iov[tmp_iov_pos].iov_base;
+            (*output_iov)[output_iov_pos].iov_len = tmp_iov[tmp_iov_pos].iov_len;
+            tmp_iov_pos++;
+            output_iov_pos++;
+        }
+        assert(*output_iov_count == output_iov_pos);
+    } while (!done);
+
+    OBJ_DESTRUCT(&convertor);
+
+    return ret;
+}
diff --git a/ompi/mca/osc/ubcl/osc_ubcl.h b/ompi/mca/osc/ubcl/osc_ubcl.h
new file mode 100644
index 00000000000..e0e4026dce9
--- /dev/null
+++ b/ompi/mca/osc/ubcl/osc_ubcl.h
@@ -0,0 +1,189 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
+/*
+ * Copyright (c) 2025 Bull SAS. All rights reserved.
+ *
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+/**
+ * @file
+ *
+ * Bull eXtreme Interconnect OSC API implementation.
+ *
+ * Implementation of API defined in osc.h. To see parameters and return values
+ * of these functions, refer to ompi/mca/osc/osc.h.
+ */
+
+#ifndef MCA_OSC_UBCL_H
+#define MCA_OSC_UBCL_H
+
+#include
+#include "ompi/mca/osc/osc.h"
+#include "ompi/datatype/ompi_datatype.h"
+#include "ompi/mca/osc/base/base.h"
+#include "ompi/mca/osc/ubcl/osc_ubcl_sync.h"
+#include "ompi/group/group.h"
+#include "ompi/communicator/communicator.h"
+#include "ompi/request/request.h"
+#include "opal/util/show_help.h"
+#include "opal/mca/threads/mutex.h"
+
+#define OSC_UBCL_IOVEC_MAX 128
+
+struct mca_osc_ubcl_module_s {
+    ompi_osc_base_module_t super;
+    struct ompi_communicator_t *comm;
+    struct ompi_win_t *win;
+    int64_t wid;
+    union { ptrdiff_t *all; ptrdiff_t uniq; } disp_unit;
+    ubcl_win_flags_t win_flags;
+
+    /* To avoid info accesses (including locking a list and string manipulations),
+     * useful info keys are stored inside the osc module.
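+     * (Currently those keys are no_locks and same_disp_unit, subscribed in
+     * osc_ubcl_info.c.)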
+     *
+     * Note that string data such as accumulate_ordering and accumulate_ops
+     * are already stored in dedicated window variables (w_acc_order and w_acc_ops)
+     */
+    uint32_t same_disp_unit:1;
+    uint32_t no_locks:1;
+    uint32_t padding_infos:30;
+
+    /* Sync type of the entire window */
+    ubcl_win_sync_type_t sync_type;
+    /* Detail of locked peers, only relevant for win_[un]lock */
+    ubcl_win_sync_type_t *procs_sync_type;
+    /* How many remote locks are currently held */
+    int64_t passive_lock_refcount;
+    /* Thread safety for lock syncs;
+     * other types of sync should never be called by concurrent threads */
+    opal_mutex_t sync_lock;
+
+    /* Active target management */
+    unsigned int nb_rank_waited;
+    struct ompi_group_t *active_sync_access_group;
+    struct ompi_group_t *active_sync_exposure_group;
+
+    /* if non-null, this pointer should be free()ed with the window */
+    void *free_after;
+};
+typedef struct mca_osc_ubcl_module_s mca_osc_ubcl_module_t;
+
+struct mca_osc_ubcl_component_s {
+    ompi_osc_base_component_t super;
+
+    /** Functional fields **/
+    volatile int64_t is_init; /**< Whether we have been initialized, for proper close */
+    int output;               /**< Output stream */
+
+    /** MCA parameters **/
+    int priority;             /**< Priority of the component */
+    int verbose;              /**< Verbosity level of the component */
+
+    /** UBCL endpoint type capabilities **/
+    opal_free_list_t req_free_list;
+    unsigned int max_req;     /**< Maximum number of requests */
+    unsigned int min_req;     /**< Minimum (and initial) number of requests */
+    unsigned int incr_req;    /**< Number of requests added when the free list grows */
+    unsigned int pad_req;
+};
+typedef struct mca_osc_ubcl_component_s mca_osc_ubcl_component_t;
+extern mca_osc_ubcl_component_t mca_osc_ubcl_component;
+
+/* One Sided operations */
+int ompi_osc_ubcl_put(const void *origin_addr,
+                      size_t origin_count,
+                      struct ompi_datatype_t *origin_dt,
+                      int target,
+                      ptrdiff_t target_disp,
+                      size_t target_count,
+                      struct ompi_datatype_t *target_dt,
+                      struct ompi_win_t *win);
+
+int ompi_osc_ubcl_rput(const void *origin_addr,
+                       size_t origin_count,
+                       struct ompi_datatype_t *origin_dt,
+                       int target,
+                       ptrdiff_t target_disp,
+                       size_t target_count,
+                       struct ompi_datatype_t *target_dt,
+                       struct ompi_win_t *win,
+                       struct ompi_request_t **ompi_req);
+
+int ompi_osc_ubcl_get(void *origin_addr,
+                      size_t origin_count,
+                      struct ompi_datatype_t *origin_dt,
+                      int target,
+                      ptrdiff_t target_disp,
+                      size_t target_count,
+                      struct ompi_datatype_t *target_dt,
+                      struct ompi_win_t *win);
+
+int ompi_osc_ubcl_rget(void *origin_addr,
+                       size_t origin_count,
+                       struct ompi_datatype_t *origin_dt,
+                       int target,
+                       ptrdiff_t target_disp,
+                       size_t target_count,
+                       struct ompi_datatype_t *target_dt,
+                       struct ompi_win_t *win,
+                       struct ompi_request_t **ompi_req);
+
+int ompi_osc_ubcl_accumulate(const void *origin_addr, size_t origin_count,
+                             struct ompi_datatype_t *origin_dt, int target, ptrdiff_t target_disp,
+                             size_t target_count, struct ompi_datatype_t *target_dt,
+                             struct ompi_op_t *op, struct ompi_win_t *win);
+
+int ompi_osc_ubcl_raccumulate(const void *origin_addr, size_t origin_count,
+                              struct ompi_datatype_t *origin_dt, int target, ptrdiff_t target_disp,
+                              size_t target_count, struct ompi_datatype_t *target_dt,
+                              struct ompi_op_t *op, struct ompi_win_t *win,
+                              struct ompi_request_t **ompi_req);
+
+int ompi_osc_ubcl_get_accumulate(const void *origin_addr, size_t origin_count,
+                                 struct ompi_datatype_t *origin_dt, void *result_addr,
+                                 size_t result_count, struct ompi_datatype_t *result_dt,
+                                 int
target_rank, ptrdiff_t target_disp, size_t target_count, + struct ompi_datatype_t *target_dt, struct ompi_op_t *op, + struct ompi_win_t *win); + +int ompi_osc_ubcl_rget_accumulate(const void *origin_addr, size_t origin_count, + struct ompi_datatype_t *origin_dt, void *result_addr, + size_t result_count, struct ompi_datatype_t *result_dt, + int target_rank, ptrdiff_t target_disp, size_t target_count, + struct ompi_datatype_t *target_dt, struct ompi_op_t *op, + struct ompi_win_t *win, struct ompi_request_t **ompi_req); + +int ompi_osc_ubcl_fetch_and_op(const void *origin_addr, void *result_addr, + struct ompi_datatype_t *dt, int target, ptrdiff_t target_disp, + struct ompi_op_t *op, struct ompi_win_t *win); + +int ompi_osc_ubcl_compare_and_swap(const void *origin_addr, const void *compare_addr, + void *result_addr, struct ompi_datatype_t *dt, int target, + ptrdiff_t target_disp, struct ompi_win_t *win); + +/* Sync functions */ +int ompi_osc_ubcl_flush(int target, + struct ompi_win_t *win); +int ompi_osc_ubcl_flush_all(struct ompi_win_t *win); +int ompi_osc_ubcl_flush_local(int target, + struct ompi_win_t *win); +int ompi_osc_ubcl_flush_local_all(struct ompi_win_t *win); + +/* ubcl custom memory descriptor management */ +size_t osc_ubcl_datatype_pack(void *pack_buf, const void *usr_handle, + size_t pack_size, size_t offset); +size_t osc_ubcl_datatype_unpack(void *usr_handle, const void *pack_buf, + size_t pack_size, size_t offset); +size_t osc_ubcl_datatype_mem_size(const void *usr_handle, size_t offset); +void osc_ubcl_datatype_finish(void *usr_handle); + +/* Misc */ +int osc_ubcl_build_ddt_iov(const void *addr, ompi_proc_t *proc, size_t count, + ompi_datatype_t *datatype, struct iovec **output_iov, + size_t *output_iov_count); + +#endif //MCA_OSC_UBCL_H diff --git a/ompi/mca/osc/ubcl/osc_ubcl_accumulate.c b/ompi/mca/osc/ubcl/osc_ubcl_accumulate.c new file mode 100644 index 00000000000..59644e829f3 --- /dev/null +++ b/ompi/mca/osc/ubcl/osc_ubcl_accumulate.c @@ -0,0 +1,1105 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2025 Bull SAS. All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +/** + * @file + * + * Bull eXtreme Interconnect OSC API implementation. + * + * Implementation of API defined in osc.h. To see parameters and return values + * of these functions, refer to ompi/mca/osc/osc.h. + */ + +#include "ompi/mca/osc/ubcl/osc_ubcl.h" +#include "opal/mca/common/ubcl/common_ubcl.h" +#include "ompi/mca/osc/ubcl/osc_ubcl_info.h" +#include "ompi/mca/osc/ubcl/osc_ubcl_utils.h" +#include "ompi/mca/osc/ubcl/osc_ubcl_sync.h" +#include "ompi/mca/osc/ubcl/osc_ubcl_request.h" +#include "ompi/mca/common/ubcl/common_ubcl.h" + +int get_ubcl_int_type(size_t size, bool is_signed, ubcl_win_atomic_datatype_t *ubcl_type) +{ + int ret = OMPI_SUCCESS; + + switch (size) { + case 1: + *ubcl_type = is_signed ? UBCL_INT8 : UBCL_UINT8; + break; + case 2: + *ubcl_type = is_signed ? UBCL_INT16 : UBCL_UINT16; + break; + case 4: + *ubcl_type = is_signed ? UBCL_INT32 : UBCL_UINT32; + break; + case 8: + *ubcl_type = is_signed ? 
UBCL_INT64 : UBCL_UINT64;
+        break;
+    default:
+        ret = OMPI_ERR_NOT_SUPPORTED;
+        break;
+    }
+
+    return ret;
+}
+
+int get_ubcl_fp_type(size_t size, ubcl_win_atomic_datatype_t *ubcl_type)
+{
+    int ret = OMPI_SUCCESS;
+
+    switch (size) {
+    case sizeof(float):
+        *ubcl_type = UBCL_FLOAT;
+        break;
+    case sizeof(double):
+        *ubcl_type = UBCL_DOUBLE;
+        break;
+    case sizeof(long double):
+        *ubcl_type = UBCL_LONG_DOUBLE;
+        break;
+    default:
+        ret = OMPI_ERR_NOT_SUPPORTED;
+        break;
+    }
+
+    return ret;
+}
+
+static int get_c_integer_ubcl_type(struct ompi_datatype_t *origin_dt,
+                                   ubcl_win_atomic_datatype_t *ubcl_type)
+{
+    int ret = OMPI_SUCCESS;
+    size_t dt_size;
+
+    if (OMPI_SUCCESS != ompi_datatype_type_size(origin_dt, &dt_size)) {
+        return OMPI_ERR_BAD_PARAM;
+    }
+
+    if (MPI_CHAR == origin_dt
+        /* Note: MPI_CHAR is not a valid type for predefined operations
+         * but MPI_SIGNED_CHAR and MPI_UNSIGNED_CHAR are.
+         * We assume MPI_CHAR behaves as MPI_SIGNED_CHAR.
+         * Cf. MPI 4.1 Section 6.9.2 (p.227)
+         *     MPI 5.0 Section 6.9.2 (p.225)
+         */
+        || MPI_INT == origin_dt || MPI_LONG == origin_dt || MPI_SHORT == origin_dt
+#if OPAL_HAVE_LONG_LONG
+        || MPI_LONG_LONG_INT == origin_dt || MPI_LONG_LONG == origin_dt
+#endif
+        || MPI_SIGNED_CHAR == origin_dt || MPI_INT8_T == origin_dt || MPI_INT16_T == origin_dt
+        || MPI_INT32_T == origin_dt || MPI_INT64_T == origin_dt) {
+
+        ret = get_ubcl_int_type(dt_size, true, ubcl_type);
+
+    } else if (MPI_UNSIGNED_SHORT == origin_dt || MPI_UNSIGNED == origin_dt
+               || MPI_UNSIGNED_LONG == origin_dt
+#if OPAL_HAVE_LONG_LONG
+               || MPI_UNSIGNED_LONG_LONG == origin_dt
+#endif
+               || MPI_UNSIGNED_CHAR == origin_dt || MPI_UINT8_T == origin_dt
+               || MPI_UINT16_T == origin_dt || MPI_UINT32_T == origin_dt
+               || MPI_UINT64_T == origin_dt) {
+
+        ret = get_ubcl_int_type(dt_size, false, ubcl_type);
+
+    } else {
+        ret = OMPI_ERR_BAD_PARAM;
+    }
+    return ret;
+}
+
+static int get_fortran_integer_ubcl_type(struct ompi_datatype_t *origin_dt,
+                                         ubcl_win_atomic_datatype_t *ubcl_type)
+{
+    int ret = OMPI_SUCCESS;
+    size_t dt_size;
+
+    if (OMPI_SUCCESS != ompi_datatype_type_size(origin_dt, &dt_size)) {
+        return OMPI_ERR_BAD_PARAM;
+    }
+
+    if (MPI_INTEGER == origin_dt
+#if OMPI_HAVE_FORTRAN_INTEGER1
+        || MPI_INTEGER1 == origin_dt
+#endif
+#if OMPI_HAVE_FORTRAN_INTEGER2
+        || MPI_INTEGER2 == origin_dt
+#endif
+#if OMPI_HAVE_FORTRAN_INTEGER4
+        || MPI_INTEGER4 == origin_dt
+#endif
+#if OMPI_HAVE_FORTRAN_INTEGER8
+        || MPI_INTEGER8 == origin_dt
+#endif
+#if OMPI_HAVE_FORTRAN_INTEGER16
+        || MPI_INTEGER16 == origin_dt
+#endif
+        ) {
+        ret = get_ubcl_int_type(dt_size, true, ubcl_type);
+    } else {
+        ret = OMPI_ERR_BAD_PARAM;
+    }
+    return ret;
+}
+
+static int get_fp_ubcl_type(struct ompi_datatype_t *origin_dt,
+                            ubcl_win_atomic_datatype_t *ubcl_type)
+{
+    /* TODO: handle MPI_TYPE_CREATE_F90_REAL handles */
+    int ret = OMPI_SUCCESS;
+    size_t dt_size;
+
+    if (OMPI_SUCCESS != ompi_datatype_type_size(origin_dt, &dt_size)) {
+        return OMPI_ERR_BAD_PARAM;
+    }
+
+    if (MPI_FLOAT == origin_dt || MPI_DOUBLE == origin_dt || MPI_REAL == origin_dt
+        || MPI_DOUBLE_PRECISION == origin_dt || MPI_LONG_DOUBLE == origin_dt
+/*#if OMPI_HAVE_FORTRAN_REAL2
+ *      || MPI_REAL2 == origin_dt
+ *#endif */
+#if OMPI_HAVE_FORTRAN_REAL4
+        || MPI_REAL4 == origin_dt
+#endif
+#if OMPI_HAVE_FORTRAN_REAL8
+        || MPI_REAL8 == origin_dt
+#endif
+#if OMPI_HAVE_FORTRAN_REAL16
+        || MPI_REAL16 == origin_dt
+#endif
+        ) {
+        ret = get_ubcl_fp_type(dt_size, ubcl_type);
+    } else {
+        ret = OMPI_ERR_BAD_PARAM;
+    }
+    return ret;
+}
+
+static int
get_logical_ubcl_type(struct ompi_datatype_t *origin_dt,
+                                 ubcl_win_atomic_datatype_t *ubcl_type)
+{
+    int ret = OMPI_SUCCESS;
+    size_t dt_size;
+
+    if (OMPI_SUCCESS != ompi_datatype_type_size(origin_dt, &dt_size)) {
+        return OMPI_ERR_BAD_PARAM;
+    }
+
+    /* Some care is needed to handle Fortran logical representations;
+     * this is not implemented yet.
+     */
+    if (MPI_C_BOOL == origin_dt || MPI_CXX_BOOL == origin_dt) {
+        ret = get_ubcl_int_type(dt_size, false, ubcl_type);
+    } else if (MPI_LOGICAL == origin_dt
+#if OMPI_HAVE_FORTRAN_LOGICAL1
+               || MPI_LOGICAL1 == origin_dt
+#endif
+#if OMPI_HAVE_FORTRAN_LOGICAL2
+               || MPI_LOGICAL2 == origin_dt
+#endif
+#if OMPI_HAVE_FORTRAN_LOGICAL4
+               || MPI_LOGICAL4 == origin_dt
+#endif
+#if OMPI_HAVE_FORTRAN_LOGICAL8
+               || MPI_LOGICAL8 == origin_dt
+#endif
+#if OMPI_HAVE_FORTRAN_LOGICAL16
+               || MPI_LOGICAL16 == origin_dt
+#endif
+               ) {
+        ret = OMPI_ERR_NOT_IMPLEMENTED;
+    } else {
+        ret = OMPI_ERR_BAD_PARAM;
+    }
+    return ret;
+}
+
+static int get_complex_ubcl_type(struct ompi_datatype_t *origin_dt,
+                                 ubcl_win_atomic_datatype_t *ubcl_type)
+{
+    int ret = OMPI_SUCCESS;
+    if (MPI_COMPLEX == origin_dt
+#if HAVE_FLOAT__COMPLEX
+        || MPI_C_COMPLEX == origin_dt || MPI_C_FLOAT_COMPLEX == origin_dt
+#endif
+#if HAVE_DOUBLE__COMPLEX
+        || MPI_C_DOUBLE_COMPLEX == origin_dt
+#endif
+#if HAVE_LONG_DOUBLE__COMPLEX
+        || MPI_C_LONG_DOUBLE_COMPLEX == origin_dt
+#endif
+        || MPI_CXX_FLOAT_COMPLEX == origin_dt || MPI_CXX_DOUBLE_COMPLEX == origin_dt
+        || MPI_CXX_LONG_DOUBLE_COMPLEX == origin_dt || MPI_DOUBLE_COMPLEX == origin_dt
+/*#if OMPI_HAVE_FORTRAN_REAL2
+ *      || MPI_COMPLEX4 == origin_dt
+ *#endif */
+#if OMPI_HAVE_FORTRAN_REAL4
+        || MPI_COMPLEX8 == origin_dt
+#endif
+#if OMPI_HAVE_FORTRAN_REAL8
+        || MPI_COMPLEX16 == origin_dt
+#endif
+#if OMPI_HAVE_FORTRAN_REAL16
+        || MPI_COMPLEX32 == origin_dt
+#endif
+        ) {
+        ret = OMPI_ERR_NOT_IMPLEMENTED;
+    } else {
+        ret = OMPI_ERR_BAD_PARAM;
+    }
+    return ret;
+}
+
+static int get_byte_ubcl_type(struct ompi_datatype_t *origin_dt,
+                              ubcl_win_atomic_datatype_t *ubcl_type)
+{
+    int ret = OMPI_SUCCESS;
+
+    if (MPI_BYTE == origin_dt) {
+        *ubcl_type = UBCL_UINT8;
+    } else {
+        ret = OMPI_ERR_BAD_PARAM;
+    }
+
+    return ret;
+}
+
+static int get_multi_language_ubcl_type(struct ompi_datatype_t *origin_dt,
+                                        ubcl_win_atomic_datatype_t *ubcl_type)
+{
+    int ret = OMPI_SUCCESS;
+    size_t dt_size;
+
+    if (OMPI_SUCCESS != ompi_datatype_type_size(origin_dt, &dt_size)) {
+        return OMPI_ERR_BAD_PARAM;
+    }
+
+    if (MPI_AINT == origin_dt || MPI_OFFSET == origin_dt || MPI_COUNT == origin_dt) {
+        ret = get_ubcl_int_type(dt_size, true, ubcl_type);
+    } else {
+        ret = OMPI_ERR_BAD_PARAM;
+    }
+
+    return ret;
+}
+
+static int get_pair_ubcl_type(struct ompi_datatype_t *origin_dt,
+                              ubcl_win_atomic_datatype_t *ubcl_type)
+{
+    int ret = OMPI_SUCCESS;
+    size_t dt_size;
+
+    if (OMPI_SUCCESS != ompi_datatype_type_size(origin_dt, &dt_size)) {
+        return OMPI_ERR_BAD_PARAM;
+    }
+
+    if (MPI_FLOAT_INT == origin_dt) {
+        *ubcl_type = UBCL_FLOAT;
+    } else if (MPI_DOUBLE_INT == origin_dt) {
+        *ubcl_type = UBCL_DOUBLE;
+    } else if (MPI_LONG_DOUBLE_INT == origin_dt) {
+        *ubcl_type = UBCL_LONG_DOUBLE;
+    } else if (MPI_LONG_INT == origin_dt) {
+        ret = get_ubcl_int_type(sizeof(long), true, ubcl_type);
+    } else if (MPI_SHORT_INT == origin_dt) {
+        ret = get_ubcl_int_type(sizeof(short), true, ubcl_type);
+    } else if (MPI_2INT == origin_dt) {
+        ret = get_ubcl_int_type(sizeof(int), true, ubcl_type);
+    } else if (MPI_2REAL == origin_dt || MPI_2DOUBLE_PRECISION == origin_dt) {
+        ret = get_ubcl_fp_type(dt_size, ubcl_type);
+    } else if (MPI_2INTEGER == origin_dt) {
+        ret = get_ubcl_int_type(dt_size, true, ubcl_type);
+    } else {
+        ret = OMPI_ERR_BAD_PARAM;
+    }
+
+    return ret;
+}
+
+static int build_ubcl_loc_op(struct ompi_datatype_t *origin_dt, struct ompi_op_t *op,
+                             ubcl_win_op_t *ubcl_op)
+{
+    ubcl_win_atomic_operator_t ubcl_operator;
+    ubcl_win_atomic_datatype_t data_type;
+    ubcl_win_atomic_datatype_t loc_type;
+    int ret = OMPI_SUCCESS;
+
+    if (MPI_MAXLOC == op) {
+        ubcl_operator = UBCL_MAXLOC;
+    } else if (MPI_MINLOC == op) {
+        ubcl_operator = UBCL_MINLOC;
+    } else {
+        return OMPI_ERR_BAD_PARAM;
+    }
+
+    ret = get_ubcl_int_type(sizeof(int), true, &loc_type);
+    if (OMPI_SUCCESS != ret) {
+        return ret;
+    }
+
+    ret = get_pair_ubcl_type(origin_dt, &data_type);
+    if (OMPI_SUCCESS != ret) {
+        return ret;
+    }
+
+    return ubcl_error_to_ompi(ubcl_win_build_loc_op(ubcl_op, data_type, ubcl_operator, loc_type));
+}
+
+#define GET_TYPE(fct, origin_dt, data_type)             \
+    do {                                                \
+        int _err = fct(origin_dt, data_type);           \
+        if (OMPI_SUCCESS == _err) {                     \
+            goto got_type;                              \
+        } else if (OMPI_ERR_NOT_IMPLEMENTED == _err) {  \
+            goto not_implemented;                       \
+        }                                               \
+    } while (0)
+
+static int build_ubcl_minmax_op(struct ompi_datatype_t *origin_dt, struct ompi_op_t *op,
+                                ubcl_win_op_t *ubcl_op)
+{
+    ubcl_win_atomic_operator_t ubcl_operator;
+    ubcl_win_atomic_datatype_t data_type = UBCL_TYPE_NONE;
+
+    if (MPI_MAX == op) {
+        ubcl_operator = UBCL_MAX;
+    } else if (MPI_MIN == op) {
+        ubcl_operator = UBCL_MIN;
+    } else {
+        return OMPI_ERR_BAD_PARAM;
+    }
+
+    /* This macro calls goto on the got_type or not_implemented labels if the
+     * datatype is one of the predefined ones of this category.
+     */
+    GET_TYPE(get_c_integer_ubcl_type, origin_dt, &data_type);
+    GET_TYPE(get_fortran_integer_ubcl_type, origin_dt, &data_type);
+    GET_TYPE(get_fp_ubcl_type, origin_dt, &data_type);
+    GET_TYPE(get_multi_language_ubcl_type, origin_dt, &data_type);
+
+    return OMPI_ERR_BAD_PARAM;
+
+got_type:
+    return ubcl_error_to_ompi(ubcl_win_build_op(ubcl_op, data_type, ubcl_operator));
+
+not_implemented:
+    return OMPI_ERR_NOT_IMPLEMENTED;
+}
+
+static int build_ubcl_arithmetic_op(struct ompi_datatype_t *origin_dt, struct ompi_op_t *op,
+                                    ubcl_win_op_t *ubcl_op)
+{
+    ubcl_win_atomic_operator_t ubcl_operator;
+    ubcl_win_atomic_datatype_t data_type = UBCL_TYPE_NONE;
+
+    if (MPI_SUM == op) {
+        ubcl_operator = UBCL_SUM;
+    } else if (MPI_PROD == op) {
+        ubcl_operator = UBCL_PROD;
+    } else {
+        return OMPI_ERR_BAD_PARAM;
+    }
+
+    /* This macro calls goto on the got_type or not_implemented labels if the
+     * datatype is one of the predefined ones of this category.
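+     * The categories follow the MPI predefined-datatype groups (C integer,
+     * Fortran integer, floating point, complex, multi-language); a datatype
+     * outside every probed category falls through to OMPI_ERR_BAD_PARAM.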
+     */
+    GET_TYPE(get_c_integer_ubcl_type, origin_dt, &data_type);
+    GET_TYPE(get_fortran_integer_ubcl_type, origin_dt, &data_type);
+    GET_TYPE(get_fp_ubcl_type, origin_dt, &data_type);
+    GET_TYPE(get_complex_ubcl_type, origin_dt, &data_type);
+    GET_TYPE(get_multi_language_ubcl_type, origin_dt, &data_type);
+
+    return OMPI_ERR_BAD_PARAM;
+
+got_type:
+    return ubcl_error_to_ompi(ubcl_win_build_op(ubcl_op, data_type, ubcl_operator));
+
+not_implemented:
+    return OMPI_ERR_NOT_IMPLEMENTED;
+}
+
+static int build_ubcl_logical_op(struct ompi_datatype_t *origin_dt, struct ompi_op_t *op,
+                                 ubcl_win_op_t *ubcl_op)
+{
+    ubcl_win_atomic_operator_t ubcl_operator;
+    ubcl_win_atomic_datatype_t data_type = UBCL_TYPE_NONE;
+
+    if (MPI_LAND == op) {
+        ubcl_operator = UBCL_LAND;
+    } else if (MPI_LOR == op) {
+        ubcl_operator = UBCL_LOR;
+    } else if (MPI_LXOR == op) {
+        ubcl_operator = UBCL_LXOR;
+    } else {
+        return OMPI_ERR_BAD_PARAM;
+    }
+
+    /* This macro calls goto on the got_type or not_implemented labels if the
+     * datatype is one of the predefined ones of this category.
+     */
+    GET_TYPE(get_c_integer_ubcl_type, origin_dt, &data_type);
+    GET_TYPE(get_logical_ubcl_type, origin_dt, &data_type);
+
+    return OMPI_ERR_BAD_PARAM;
+
+got_type:
+    return ubcl_error_to_ompi(ubcl_win_build_op(ubcl_op, data_type, ubcl_operator));
+
+not_implemented:
+    return OMPI_ERR_NOT_IMPLEMENTED;
+}
+
+static int build_ubcl_bitwise_op(struct ompi_datatype_t *origin_dt, struct ompi_op_t *op,
+                                 ubcl_win_op_t *ubcl_op)
+{
+    ubcl_win_atomic_operator_t ubcl_operator;
+    ubcl_win_atomic_datatype_t data_type = UBCL_TYPE_NONE;
+
+    if (MPI_BAND == op) {
+        ubcl_operator = UBCL_BAND;
+    } else if (MPI_BOR == op) {
+        ubcl_operator = UBCL_BOR;
+    } else if (MPI_BXOR == op) {
+        ubcl_operator = UBCL_BXOR;
+    } else {
+        return OMPI_ERR_BAD_PARAM;
+    }
+
+    /* This macro calls goto on the got_type or not_implemented labels if the
+     * datatype is one of the predefined ones of this category.
+     */
+    GET_TYPE(get_c_integer_ubcl_type, origin_dt, &data_type);
+    GET_TYPE(get_fortran_integer_ubcl_type, origin_dt, &data_type);
+    GET_TYPE(get_byte_ubcl_type, origin_dt, &data_type);
+    GET_TYPE(get_multi_language_ubcl_type, origin_dt, &data_type);
+
+    return OMPI_ERR_BAD_PARAM;
+
+got_type:
+    return ubcl_error_to_ompi(ubcl_win_build_op(ubcl_op, data_type, ubcl_operator));
+
+not_implemented:
+    return OMPI_ERR_NOT_IMPLEMENTED;
+}
+
+static int build_ubcl_fake_op(struct ompi_datatype_t *origin_dt, struct ompi_op_t *op,
+                              ubcl_win_op_t *ubcl_op)
+{
+    int ret = OMPI_SUCCESS;
+    ubcl_win_atomic_operator_t ubcl_operator;
+    ubcl_win_atomic_datatype_t data_type = UBCL_TYPE_NONE;
+    ubcl_win_atomic_datatype_t loc_type = UBCL_TYPE_NONE;
+
+    if (MPI_REPLACE == op) {
+        ubcl_operator = UBCL_REPLACE;
+    } else if (MPI_NO_OP == op) {
+        ubcl_operator = UBCL_NO_OP;
+    } else {
+        return OMPI_ERR_BAD_PARAM;
+    }
+
+    /* This macro calls goto on the got_type or not_implemented labels if the
+     * datatype is one of the predefined ones of this category.
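+     * Pair (value, index) datatypes are handled separately below through
+     * get_pair_ubcl_type, with a signed int index component.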
+ */ + GET_TYPE(get_c_integer_ubcl_type, origin_dt, &data_type); + GET_TYPE(get_fortran_integer_ubcl_type, origin_dt, &data_type); + GET_TYPE(get_fp_ubcl_type, origin_dt, &data_type); + GET_TYPE(get_logical_ubcl_type, origin_dt, &data_type); + GET_TYPE(get_complex_ubcl_type, origin_dt, &data_type); + GET_TYPE(get_byte_ubcl_type, origin_dt, &data_type); + GET_TYPE(get_multi_language_ubcl_type, origin_dt, &data_type); + + ret = get_pair_ubcl_type(origin_dt, &data_type); + if (OMPI_SUCCESS != ret) { + return ret; + } + ret = get_ubcl_int_type(sizeof(int), true, &loc_type); + if (OMPI_SUCCESS != ret) { + return ret; + } + + +got_type: + return ubcl_error_to_ompi(ubcl_win_build_loc_op(ubcl_op, data_type, ubcl_operator, loc_type)); + +not_implemented: + return OMPI_ERR_NOT_IMPLEMENTED; +} + +static int build_ubcl_op(struct ompi_datatype_t *origin_dt, struct ompi_op_t *op, + ubcl_win_op_t *ubcl_op) +{ + if (MPI_MAXLOC == op || MPI_MINLOC == op) { + return build_ubcl_loc_op(origin_dt, op, ubcl_op); + } else if (MPI_MAX == op || MPI_MIN == op) { + return build_ubcl_minmax_op(origin_dt, op, ubcl_op); + } else if (MPI_SUM == op || MPI_PROD == op) { + return build_ubcl_arithmetic_op(origin_dt, op, ubcl_op); + } else if (MPI_LAND == op || MPI_LOR == op || MPI_LXOR == op) { + return build_ubcl_logical_op(origin_dt, op, ubcl_op); + } else if (MPI_BAND == op || MPI_BOR == op || MPI_BXOR == op) { + return build_ubcl_bitwise_op(origin_dt, op, ubcl_op); + } else if (MPI_REPLACE == op || MPI_NO_OP == op) { + return build_ubcl_fake_op(origin_dt, op, ubcl_op); + } else { + return OMPI_ERR_BAD_PARAM; + } +} + +int ompi_osc_ubcl_accumulate(const void *origin_addr, size_t origin_count, + struct ompi_datatype_t *origin_dt, int target, ptrdiff_t target_disp, + size_t target_count, struct ompi_datatype_t *target_dt, + struct ompi_op_t *op, struct ompi_win_t *win) +{ + return ompi_osc_ubcl_raccumulate(origin_addr, origin_count, origin_dt, target, target_disp, + target_count, target_dt, op, win, NULL); +} + +int ompi_osc_ubcl_raccumulate(const void *origin_addr, size_t origin_count, + struct ompi_datatype_t *origin_dt, int target, ptrdiff_t target_disp, + size_t target_count, struct ompi_datatype_t *target_dt, + struct ompi_op_t *op, struct ompi_win_t *win, + struct ompi_request_t **ompi_req) +{ + return ompi_osc_ubcl_rget_accumulate(origin_addr, origin_count, origin_dt, NULL, 0, NULL, + target, target_disp, target_count, target_dt, op, win, + ompi_req); +} + +int ompi_osc_ubcl_get_accumulate(const void *origin_addr, size_t origin_count, + struct ompi_datatype_t *origin_dt, void *result_addr, + size_t result_count, struct ompi_datatype_t *result_dt, + int target_rank, ptrdiff_t target_disp, size_t target_count, + struct ompi_datatype_t *target_dt, struct ompi_op_t *op, + struct ompi_win_t *win) +{ + return ompi_osc_ubcl_rget_accumulate(origin_addr, origin_count, origin_dt, result_addr, + result_count, result_dt, target_rank, target_disp, + target_count, target_dt, op, win, NULL); +} + +static int compute_aligned_iovecs_count(struct iovec **iovecs, size_t *iovecs_count, + int iovecs_to_align, size_t *aligned_iovec_count) +{ + size_t segment[iovecs_to_align]; + size_t consumed_size[iovecs_to_align]; + size_t aligned_count = 0; + + for (int i = 0; i < iovecs_to_align; i++) { + segment[i] = 0; + consumed_size[i] = 0; + } + + /* Stop when we reach the end of one iovec */ + while (true) { + size_t min_remaining_size = UINT64_MAX; + + /* Get the minimum remaining size */ + for (int i = 0; i < iovecs_to_align; i++) { + if 
(segment[i] >= iovecs_count[i]) {
+                goto end_compute_aligned_count;
+            }
+
+            size_t remaining_size = iovecs[i][segment[i]].iov_len - consumed_size[i];
+
+            if (remaining_size < min_remaining_size) {
+                min_remaining_size = remaining_size;
+            }
+        }
+
+        /* Consume size */
+        for (int i = 0; i < iovecs_to_align; i++) {
+            consumed_size[i] += min_remaining_size;
+
+            if (consumed_size[i] == iovecs[i][segment[i]].iov_len) {
+                consumed_size[i] = 0;
+                segment[i]++;
+            }
+        }
+
+        aligned_count++;
+    }
+end_compute_aligned_count:
+
+    /* The send buffer must fit in the target buffer, and the target buffer
+     * must fit in the fetch buffer, so the send buffer must be the smallest
+     * and all its segments must have been consumed
+     */
+    if (segment[0] < iovecs_count[0]) {
+        return OMPI_ERROR;
+    }
+
+    *aligned_iovec_count = aligned_count;
+
+    return OMPI_SUCCESS;
+}
+
+static void compute_aligned_iovecs(struct iovec **iovecs, size_t *iovecs_count, int iovecs_to_align,
+                                   struct iovec **aligned_iovecs, size_t aligned_iovec_count)
+{
+    size_t segment[iovecs_to_align];
+    size_t consumed_size[iovecs_to_align];
+
+    /* Run through iovecs a second time to fill aligned_iovecs */
+    for (int i = 0; i < iovecs_to_align; i++) {
+        segment[i] = 0;
+        consumed_size[i] = 0;
+    }
+
+    for (size_t seg = 0; seg < aligned_iovec_count; seg++) {
+        size_t min_remaining_size = UINT64_MAX;
+
+        /* Get the minimum remaining size */
+        for (int i = 0; i < iovecs_to_align; i++) {
+            size_t remaining_size = iovecs[i][segment[i]].iov_len - consumed_size[i];
+
+            if (remaining_size < min_remaining_size) {
+                min_remaining_size = remaining_size;
+            }
+        }
+
+        /* Consume size */
+        for (int i = 0; i < iovecs_to_align; i++) {
+            aligned_iovecs[i][seg].iov_base = (char *) iovecs[i][segment[i]].iov_base
+                                              + consumed_size[i];
+            aligned_iovecs[i][seg].iov_len = min_remaining_size;
+
+            consumed_size[i] += min_remaining_size;
+
+            if (consumed_size[i] == iovecs[i][segment[i]].iov_len) {
+                consumed_size[i] = 0;
+                segment[i]++;
+            }
+        }
+    }
+}
+
+/* This function takes an array of iovec arrays with arbitrary fragmentation,
+ * and allocates a new array of iovec arrays describing the same memory areas,
+ * potentially split into smaller segments.
+ *
+ * All the returned iovec arrays have the same number of fragments, and the
+ * i-th fragment has the same length in each of them.
+ *
+ * If the input iovec arrays have different total lengths, they must be
+ * provided in increasing order of total length.
+ * In this case, iovecs are truncated according to the smallest one.
+ * An error is raised if the smallest one is not the first one.
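+ *
+ * For example (hypothetical layout), aligning an iovec array with segment
+ * lengths [8] against one with lengths [4, 4] yields two arrays with
+ * lengths [4, 4]: the single 8-byte segment is split at the boundary
+ * imposed by the other array.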
+ */
+static int align_iovecs(struct iovec **iovecs, size_t *iovecs_count, int iovecs_to_align,
+                        struct iovec **aligned_iovecs, size_t *aligned_iovec_count)
+{
+    size_t aligned_count = 0;
+    int ret;
+
+    ret = compute_aligned_iovecs_count(iovecs, iovecs_count, iovecs_to_align, &aligned_count);
+    if (OMPI_SUCCESS != ret) {
+        return ret;
+    }
+
+    /* Allocate aligned_iovecs, unwinding on failure so callers see NULL
+     * pointers rather than a partial allocation */
+    for (int i = 0; i < iovecs_to_align; i++) {
+        aligned_iovecs[i] = (struct iovec *) malloc(aligned_count * sizeof(struct iovec));
+        if (NULL == aligned_iovecs[i]) {
+            while (i-- > 0) {
+                free(aligned_iovecs[i]);
+                aligned_iovecs[i] = NULL;
+            }
+            return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
+        }
+    }
+
+    compute_aligned_iovecs(iovecs, iovecs_count, iovecs_to_align, aligned_iovecs, aligned_count);
+
+    *aligned_iovec_count = aligned_count;
+
+    return OMPI_SUCCESS;
+}
+
+static struct ompi_datatype_t *segmented_rget_get_base_datatype(struct ompi_datatype_t *origin_dt,
+                                                                struct ompi_datatype_t *target_dt,
+                                                                struct ompi_datatype_t *result_dt,
+                                                                struct ompi_op_t *op)
+{
+    struct ompi_datatype_t *base_datatype;
+
+    /* Get predefined datatype used to build target_dt */
+    base_datatype = ompi_datatype_get_single_predefined_type_from_args(target_dt);
+    if (NULL == base_datatype) {
+        /* Null means more than one, not allowed */
+        return NULL;
+    }
+
+    /* Ensure origin_dt and result_dt are built from the same predefined type as target_dt */
+    if (MPI_NO_OP != op
+        && base_datatype != ompi_datatype_get_single_predefined_type_from_args(origin_dt)) {
+        return NULL;
+    }
+    if (NULL != result_dt
+        && base_datatype != ompi_datatype_get_single_predefined_type_from_args(result_dt)) {
+        return NULL;
+    }
+
+    return base_datatype;
+}
+
+static int segmented_rget_build_aligned_iovecs(
+    const void *origin_addr, size_t origin_count, struct ompi_datatype_t *origin_dt, void *result_addr,
+    size_t result_count, struct ompi_datatype_t *result_dt, int target_rank, ptrdiff_t target_disp,
+    size_t target_count, struct ompi_datatype_t *target_dt, struct ompi_op_t *op, ompi_proc_t *proc,
+    mca_osc_ubcl_module_t *module, struct iovec *aligned_iovec[3], size_t *aligned_iovec_count)
+{
+    int ret;
+    ptrdiff_t disp_unit;
+    struct iovec *base_iovec[3] = {NULL, NULL, NULL};
+    size_t base_iovec_count[3] = {0, 0, 0};
+    disp_unit = osc_ubcl_get_disp_unit(module, target_rank);
+
+    if (MPI_NO_OP != op) {
+        /* Build origin iovec based on origin addr/count/datatype */
+        ret = osc_ubcl_build_ddt_iov(origin_addr, proc, origin_count, origin_dt, &base_iovec[0],
+                                     &base_iovec_count[0]);
+        if (OMPI_SUCCESS != ret) {
+            goto error;
+        }
+    }
+
+    /* Build target iovec with relative offsets in the target window */
+    ret = osc_ubcl_build_ddt_iov((void *) (target_disp * disp_unit), proc, target_count, target_dt,
+                                 &base_iovec[1], &base_iovec_count[1]);
+    if (OMPI_SUCCESS != ret) {
+        goto error;
+    }
+
+    if (NULL != result_dt) {
+        /* Build result iovec based on result addr/count/datatype */
+        ret = osc_ubcl_build_ddt_iov(result_addr, proc, result_count, result_dt, &base_iovec[2],
+                                     &base_iovec_count[2]);
+        if (OMPI_SUCCESS != ret) {
+            goto error;
+        }
+        if (MPI_NO_OP == op) {
+            /* No origin iovec to align */
+            ret = align_iovecs(&base_iovec[1], &base_iovec_count[1], 2, &aligned_iovec[1],
+                               aligned_iovec_count);
+        } else {
+            ret = align_iovecs(base_iovec, base_iovec_count, 3, aligned_iovec, aligned_iovec_count);
+        }
+
+        /* TODO: compute additional no_op segments if target buffer is larger than origin buffer */
+    } else {
+        if (MPI_NO_OP == op) {
+            ret = OMPI_ERR_BAD_PARAM;
+            goto error;
+        }
+        /* No result iovec to align */
+        ret = align_iovecs(base_iovec, base_iovec_count, 2, aligned_iovec, aligned_iovec_count);
+    }
+
+    if (OMPI_SUCCESS != ret) {
+        goto error;
+    }
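+    /* Success falls through to the error label on purpose: the base iovec
+     * arrays are freed on both paths, since the aligned iovecs reference the
+     * underlying buffers directly rather than the base arrays. */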
+
+    ret = OMPI_SUCCESS;
+
+error:
+    free(base_iovec[0]);
+    free(base_iovec[1]);
+    free(base_iovec[2]);
+    return ret;
+}
+
+static int segmented_rget_accumulate(const void *origin_addr, size_t origin_count,
+                                     struct ompi_datatype_t *origin_dt, void *result_addr,
+                                     size_t result_count, struct ompi_datatype_t *result_dt,
+                                     int target_rank, ptrdiff_t target_disp, size_t target_count,
+                                     struct ompi_datatype_t *target_dt, struct ompi_op_t *op,
+                                     struct ompi_win_t *win, struct ompi_request_t **ompi_req)
+{
+    int ret;
+    mca_osc_ubcl_module_t *module;
+    ompi_proc_t *proc;
+    mca_common_ubcl_endpoint_t *endpoint;
+    struct iovec *aligned_iovec[3] = {NULL, NULL, NULL};
+    size_t aligned_iovec_count;
+    struct ompi_datatype_t *base_datatype;
+    ubcl_win_op_t ubcl_op;
+    mca_osc_ubcl_request_t *req;
+    ubcl_completion_callback_fct cb;
+    void *cb_data;
+    size_t base_dt_size;
+
+    module = (mca_osc_ubcl_module_t *) win->w_osc_module;
+
+    /* Get base datatype to build operation */
+    base_datatype = segmented_rget_get_base_datatype(origin_dt, target_dt, result_dt, op);
+    if (NULL == base_datatype) {
+        return OMPI_ERR_BAD_PARAM;
+    }
+    ret = ompi_datatype_type_size(base_datatype, &base_dt_size);
+    if (OMPI_SUCCESS != ret) {
+        goto error;
+    }
+    ret = build_ubcl_op(base_datatype, op, &ubcl_op);
+    if (OMPI_SUCCESS != ret) {
+        goto error;
+    }
+
+    /* Get proc */
+    proc = ompi_group_peer_lookup(win->w_group, target_rank);
+    if (OPAL_UNLIKELY(NULL == proc)) {
+        return OMPI_ERR_BAD_PARAM;
+    }
+
+    endpoint = (mca_common_ubcl_endpoint_t *) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PML];
+
+    /* Compute accumulate segmentation into contiguous parts */
+    ret = segmented_rget_build_aligned_iovecs(origin_addr, origin_count, origin_dt, result_addr,
+                                              result_count, result_dt, target_rank, target_disp,
+                                              target_count, target_dt, op, proc, module,
+                                              aligned_iovec, &aligned_iovec_count);
+    if (OMPI_SUCCESS != ret) {
+        goto error;
+    }
+
+    /* Build the request if needed */
+    if (NULL == ompi_req) {
+        req = NULL;
+        cb = NULL;
+        cb_data = NULL;
+    } else {
+        req = (mca_osc_ubcl_request_t *) opal_free_list_get(&mca_osc_ubcl_component.req_free_list);
+        if (OPAL_UNLIKELY(NULL == req)) {
+            ret = OMPI_ERR_OUT_OF_RESOURCE;
+            goto error;
+        }
+
+        MCA_OSC_UBCL_REQUEST_INIT(req, target_rank, (struct ompi_datatype_t *) NULL,
+                                  (struct ompi_datatype_t *) NULL, win, true);
+
+        *ompi_req = &req->ompi_req;
+        cb = ubcl_request_complete_cb;
+        cb_data = req;
+        req->segment_count = aligned_iovec_count;
+    }
+
+    for (size_t i = 0; i < aligned_iovec_count; i++) {
+        void *sbuf;
+        void *fetch_buf;
+        ptrdiff_t offset;
+        size_t count;
+        ubcl_error_t err;
+
+        /* Check if there is data to send */
+        if (MPI_NO_OP == op) {
+            sbuf = NULL;
+        } else {
+            sbuf = aligned_iovec[0][i].iov_base;
+            assert(aligned_iovec[0][i].iov_len == aligned_iovec[1][i].iov_len);
+        }
+
+        /* Target buffer offset in bytes, relative to the window base */
+        offset = (ptrdiff_t) aligned_iovec[1][i].iov_base;
+
+        /* Check if there is data to fetch */
+        if (NULL != result_dt) {
+            fetch_buf = aligned_iovec[2][i].iov_base;
+            assert(aligned_iovec[1][i].iov_len == aligned_iovec[2][i].iov_len);
+        } else {
+            fetch_buf = NULL;
+        }
+
+        /* Count in terms of base datatypes in this segment */
+        count = aligned_iovec[1][i].iov_len / base_dt_size;
+
+        /* Submit contiguous operation to ubcl */
+        err = ubcl_accumulate(sbuf, fetch_buf, count, endpoint->rank, offset, &ubcl_op, module->wid,
+                              cb, cb_data);
+        ret = ubcl_error_to_ompi(err);
+        if (OMPI_SUCCESS != ret) {
+            if (0 == i && NULL != req) {
+                /* This is the first segment, so we can fail cleanly */
+                opal_free_list_return(&mca_osc_ubcl_component.req_free_list, &req->super);
+            } else {
+                /* Some segments have already been sent; we are in a really bad situation */
+                mca_osc_ubcl_error(ret,
+                                   "Failed to send fragment %zu of an accumulate "
+                                   "operation segmented into %zu parts. "
+                                   "This error is not recoverable\n",
+                                   i, aligned_iovec_count);
+            }
+            goto error;
+        }
+    }
+
+    ret = OMPI_SUCCESS;
+
+error:
+    free(aligned_iovec[0]);
+    free(aligned_iovec[1]);
+    free(aligned_iovec[2]);
+    return ret;
+}
+
+int ompi_osc_ubcl_rget_accumulate(const void *origin_addr, size_t origin_count,
+                                  struct ompi_datatype_t *origin_dt, void *result_addr,
+                                  size_t result_count, struct ompi_datatype_t *result_dt,
+                                  int target_rank, ptrdiff_t target_disp, size_t target_count,
+                                  struct ompi_datatype_t *target_dt, struct ompi_op_t *op,
+                                  struct ompi_win_t *win, struct ompi_request_t **ompi_req)
+{
+    ubcl_error_t err;
+    int ret;
+    ptrdiff_t disp_unit;
+    mca_common_ubcl_endpoint_t *endpoint;
+    mca_osc_ubcl_module_t *module;
+    ompi_proc_t *proc;
+    ubcl_win_op_t ubcl_op;
+    ptrdiff_t remote_offset;
+    ubcl_completion_callback_fct cb;
+    void *cb_data;
+    struct ompi_datatype_t *dt;
+    size_t count;
+    ptrdiff_t origin_size;
+    ptrdiff_t target_size;
+    ptrdiff_t gap;
+
+    module = (mca_osc_ubcl_module_t *) win->w_osc_module;
+    disp_unit = osc_ubcl_get_disp_unit(module, target_rank);
+
+    if (MPI_NO_OP != op) {
+        origin_size = opal_datatype_span((const opal_datatype_t *) origin_dt, origin_count, &gap);
+    } else {
+        origin_size = 0;
+    }
+    target_size = opal_datatype_span((const opal_datatype_t *) target_dt, target_count, &gap);
+    (void) gap;
+
+    if (0 == target_size || (NULL == result_dt && 0 == origin_size)) {
+        if (NULL != ompi_req) {
+            *ompi_req = &ompi_request_empty;
+        }
+        return OMPI_SUCCESS;
+    }
+
+    if ((MPI_NO_OP != op && !ompi_datatype_is_predefined(origin_dt))
+        || !ompi_datatype_is_predefined(target_dt)
+        || (NULL != result_dt && !ompi_datatype_is_predefined(result_dt))) {
+        /* Let's take the hard way */
+        return segmented_rget_accumulate(origin_addr, origin_count, origin_dt, result_addr,
+                                         result_count, result_dt, target_rank, target_disp,
+                                         target_count, target_dt, op, win, ompi_req);
+    }
+
+    /* Get proc */
+    proc = ompi_group_peer_lookup(win->w_group, target_rank);
+    if (OPAL_UNLIKELY(NULL == proc)) {
+        return OMPI_ERR_BAD_PARAM;
+    }
+
+    endpoint = (mca_common_ubcl_endpoint_t *) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PML];
+    if (OMPI_SUCCESS != ompi_osc_ubcl_check_access_epoch(target_rank, win)) {
+        return OMPI_ERR_RMA_CONFLICT;
+    }
+
+    if (MPI_NO_OP == op) {
+        dt = target_dt;
+        count = target_count;
+        origin_addr = NULL;
+    } else {
+        dt = origin_dt;
+        count = origin_count;
+    }
+
+    ret = build_ubcl_op(dt, op, &ubcl_op);
+    if (OMPI_SUCCESS != ret) {
+        return ret;
+    }
+
+    if (NULL == ompi_req) {
+        cb = NULL;
+        cb_data = NULL;
+    } else {
+        mca_osc_ubcl_request_t *req;
+        req = (mca_osc_ubcl_request_t *) opal_free_list_get(&mca_osc_ubcl_component.req_free_list);
+        if (OPAL_UNLIKELY(NULL == req)) {
+            return OMPI_ERR_OUT_OF_RESOURCE;
+        }
+
+        MCA_OSC_UBCL_REQUEST_INIT(req, target_rank, (struct ompi_datatype_t *) NULL,
+                                  (struct ompi_datatype_t *) NULL, win, true);
+
+        *ompi_req = &req->ompi_req;
+        cb = ubcl_request_complete_cb;
+        cb_data = req;
+    }
+
+    remote_offset = target_disp * disp_unit;
+
+    /* TODO: handle non-contiguous datatypes, as MPI seems to allow some of them */
+    err = ubcl_accumulate((void *) origin_addr, result_addr, count, endpoint->rank,
+                          remote_offset, &ubcl_op, module->wid, cb, cb_data);
+
+    ret = ubcl_error_to_ompi(err);
+
+    if (OMPI_SUCCESS != ret && NULL != cb_data) {
+        opal_free_list_return(&mca_osc_ubcl_component.req_free_list, cb_data);
+    }
+
+    return ret;
+}
+
+int ompi_osc_ubcl_fetch_and_op(const void *origin_addr, void *result_addr,
+                               struct ompi_datatype_t *dt, int target, ptrdiff_t target_disp,
+                               struct ompi_op_t *op, struct ompi_win_t *win)
+{
+    if (!ompi_datatype_is_predefined(dt)) {
+        return OMPI_ERR_BAD_PARAM;
+    }
+    return ompi_osc_ubcl_get_accumulate(origin_addr, 1, dt, result_addr, 1, dt, target, target_disp,
+                                        1, dt, op, win);
+}
+
+int ompi_osc_ubcl_compare_and_swap(const void *origin_addr, const void *compare_addr,
+                                   void *result_addr, struct ompi_datatype_t *dt, int target,
+                                   ptrdiff_t target_disp, struct ompi_win_t *win)
+{
+    ubcl_win_atomic_datatype_t data_type = UBCL_TYPE_NONE;
+    ptrdiff_t disp_unit;
+    mca_osc_ubcl_module_t *module;
+    ompi_proc_t *proc;
+    mca_common_ubcl_endpoint_t *endpoint;
+    ubcl_error_t err;
+
+    module = (mca_osc_ubcl_module_t *) win->w_osc_module;
+    disp_unit = osc_ubcl_get_disp_unit(module, target);
+
+    GET_TYPE(get_c_integer_ubcl_type, dt, &data_type);
+    GET_TYPE(get_fortran_integer_ubcl_type, dt, &data_type);
+    GET_TYPE(get_logical_ubcl_type, dt, &data_type);
+    GET_TYPE(get_byte_ubcl_type, dt, &data_type);
+    GET_TYPE(get_multi_language_ubcl_type, dt, &data_type);
+
+    return OMPI_ERR_BAD_PARAM;
+
+got_type:
+    /* Get proc */
+    proc = ompi_group_peer_lookup(win->w_group, target);
+    if (OPAL_UNLIKELY(NULL == proc)) {
+        return OMPI_ERR_BAD_PARAM;
+    }
+
+    endpoint = (mca_common_ubcl_endpoint_t *) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PML];
+    if (OMPI_SUCCESS != ompi_osc_ubcl_check_access_epoch(target, win)) {
+        return OMPI_ERR_RMA_CONFLICT;
+    }
+
+    err = ubcl_cas(origin_addr, compare_addr, result_addr, data_type, endpoint->rank,
+                   target_disp * disp_unit, module->wid, NULL, NULL);
+
+    return ubcl_error_to_ompi(err);
+
+not_implemented:
+    return OMPI_ERR_NOT_IMPLEMENTED;
+}
diff --git a/ompi/mca/osc/ubcl/osc_ubcl_datatype.c b/ompi/mca/osc/ubcl/osc_ubcl_datatype.c
new file mode 100644
index 00000000000..b81bbacf3dc
--- /dev/null
+++ b/ompi/mca/osc/ubcl/osc_ubcl_datatype.c
@@ -0,0 +1,86 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
+/*
+ * Copyright (c) 2025 Bull SAS. All rights reserved.
+ *
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+/**
+ * @file
+ *
+ * OSC/UBCL datatype and convertor related functions
+ *
+ */
+
+#include "ompi/mca/osc/ubcl/osc_ubcl.h"
+#include "ompi/mca/osc/ubcl/osc_ubcl_utils.h"
+
+size_t osc_ubcl_datatype_pack(void *pack_buf, const void *usr_handle, size_t pack_size,
+                              size_t offset)
+{
+    opal_convertor_t *convertor = (opal_convertor_t *) usr_handle;
+
+    /* Set input data size and start pointer. */
+    uint32_t iov_count = 1;
+    int ret = 0;
+    struct iovec iov;
+    iov.iov_len = pack_size;
+    iov.iov_base = (IOVBASE_TYPE *) pack_buf;
+
+    opal_convertor_set_position(convertor, &offset);
+
+    /* Pack data from the convertor into the iov */
+    ret = opal_convertor_pack(convertor, &iov, &iov_count, &pack_size);
+    if (-1 == ret) {
+        mca_osc_ubcl_error(ret, "opal_convertor_pack failed\n");
+    }
+
+    return pack_size;
+}
+
+size_t osc_ubcl_datatype_unpack(void *usr_handle, const void *pack_buf, size_t pack_size,
+                                size_t offset)
+{
+    opal_convertor_t *convertor = (opal_convertor_t *) usr_handle;
+
+    /* Set input data size and start pointer. */
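+    /* UBCL drives this callback with a byte offset into the flattened
+     * datatype stream: reposition the convertor before unpacking so the
+     * incoming bytes land at the right place in the user buffer. */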
+    uint32_t iov_count = 1;
+    int ret = 0;
+    struct iovec iov;
+    iov.iov_len = pack_size;
+    iov.iov_base = (IOVBASE_TYPE *) pack_buf;
+
+    opal_convertor_set_position(convertor, &offset);
+
+    /* Unpack data from the iov into the convertor */
+    ret = opal_convertor_unpack(convertor, &iov, &iov_count, &pack_size);
+    if (-1 == ret) {
+        mca_osc_ubcl_error(ret, "opal_convertor_unpack failed\n");
+    }
+
+    return pack_size;
+}
+
+size_t osc_ubcl_datatype_mem_size(const void *usr_handle, size_t offset)
+{
+    opal_convertor_t *convertor = (opal_convertor_t *) usr_handle;
+    size_t size = 0;
+
+    opal_datatype_type_size(convertor->pDesc, &size);
+
+    if (offset > size * convertor->count) {
+        return 0;
+    }
+
+    return size * convertor->count - offset;
+}
+
+void osc_ubcl_datatype_finish(void *usr_handle)
+{
+    (void) usr_handle;
+    return;
+}
diff --git a/ompi/mca/osc/ubcl/osc_ubcl_get.c b/ompi/mca/osc/ubcl/osc_ubcl_get.c
new file mode 100644
index 00000000000..6dd42ef4240
--- /dev/null
+++ b/ompi/mca/osc/ubcl/osc_ubcl_get.c
@@ -0,0 +1,167 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
+/*
+ * Copyright (c) 2025 Bull SAS. All rights reserved.
+ *
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "ompi/mca/osc/ubcl/osc_ubcl.h"
+#include "ompi/mca/osc/ubcl/osc_ubcl_info.h"
+#include "ompi/mca/osc/ubcl/osc_ubcl_request.h"
+#include "opal/mca/common/ubcl/common_ubcl.h"
+#include "ompi/mca/osc/ubcl/osc_ubcl_utils.h"
+#include "ompi/mca/common/ubcl/common_ubcl.h"
+
+int ompi_osc_ubcl_get(void *origin_addr, size_t origin_count,
+                      struct ompi_datatype_t *origin_dt, int target,
+                      ptrdiff_t target_disp, size_t target_count,
+                      struct ompi_datatype_t *target_dt, struct ompi_win_t *win)
+{
+    return ompi_osc_ubcl_rget(origin_addr, origin_count, origin_dt,
+                              target, target_disp, target_count, target_dt,
+                              win, NULL);
+}
+
+int ompi_osc_ubcl_rget(void *origin_addr, size_t origin_count,
+                       struct ompi_datatype_t *origin_dt, int target,
+                       ptrdiff_t target_disp, size_t target_count,
+                       struct ompi_datatype_t *target_dt,
+                       struct ompi_win_t *win,
+                       struct ompi_request_t **ompi_req)
+{
+    ubcl_error_t err = 0;
+    int ret = OMPI_SUCCESS;
+    ptrdiff_t disp_unit;
+    ptrdiff_t gap;
+    size_t span;
+    size_t target_span;
+    size_t target_iov_count;
+    struct iovec *target_iov;
+    void *target_addr;
+    mca_common_ubcl_endpoint_t *endpoint;
+    ubcl_memory_descriptor_t sbuf_md;
+    mca_osc_ubcl_module_t *module;
+    mca_osc_ubcl_request_t *osc_req;
+
+    module = (mca_osc_ubcl_module_t *) win->w_osc_module;
+    disp_unit = osc_ubcl_get_disp_unit(module, target);
+
+    OPAL_OUTPUT_VERBOSE(
+        (50, mca_osc_ubcl_component.output, "UBCL_OSC_GET to window %lu\n", module->wid));
+
+    /* Get proc */
+    ompi_proc_t *proc;
+    proc = ompi_group_peer_lookup_existing(win->w_group, target);
+    if (OPAL_UNLIKELY(NULL == proc)) {
+        ret = OMPI_ERR_BAD_PARAM;
+        mca_osc_ubcl_warn(ret, "Unknown rank %d on window %d", target, module->wid);
+        goto exit;
+    }
+
+    target_span = opal_datatype_span((const opal_datatype_t *) target_dt, target_count, &gap);
+    if (0 == target_span) {
+        if (NULL != ompi_req) {
+            *ompi_req = &ompi_request_empty;
+        }
+        return OMPI_SUCCESS;
+    }
+    (void) gap;
+
+    /* We retrieve endpoints created by the PML at init */
+    endpoint = (mca_common_ubcl_endpoint_t *) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PML];
+
+    /* Allocate an OSC request */
+    osc_req = (mca_osc_ubcl_request_t *) opal_free_list_get(&mca_osc_ubcl_component.req_free_list);
+    if (OPAL_UNLIKELY(NULL == osc_req)) {
+        ret = OMPI_ERR_OUT_OF_RESOURCE;
+        mca_osc_ubcl_warn(ret, "Not enough memory to allocate an OSC request");
+        goto exit;
+    }
+    if (NULL != ompi_req) {
+        MCA_OSC_UBCL_REQUEST_INIT(osc_req, target, origin_dt, target_dt, win, true);
+        *ompi_req = &osc_req->ompi_req;
+    } else {
+        MCA_OSC_UBCL_REQUEST_INIT(osc_req, target, origin_dt, target_dt, win, false);
+    }
+
+    /* Init UBCL MD */
+    err = ubcl_memory_descriptor_init(&sbuf_md);
+    if (UBCL_SUCCESS != err) {
+        /* This should never happen: ubcl_memory_descriptor_init just assigns values */
+        mca_osc_ubcl_error(ubcl_error_to_ompi(err), "Failed to initialize ubcl MD");
+    }
+
+    /* If we don't need to pack, we can build a contiguous memory descriptor */
+    if (ompi_datatype_is_contiguous_memory_layout(origin_dt, origin_count)) {
+        span = opal_datatype_span((const opal_datatype_t *) origin_dt, origin_count, &gap);
+        err = ubcl_memory_descriptor_build_contiguous(((char *) origin_addr) + gap, span, &sbuf_md);
+        if (UBCL_SUCCESS != err) {
+            mca_osc_ubcl_error(ubcl_error_to_ompi(err),
+                               "Failed to build contiguous memory descriptor for input buffer");
+        }
+    }
+
+    /* Always build a custom MD representation so that we have a fallback */
+    opal_convertor_copy_and_prepare_for_recv(proc->super.proc_convertor, &origin_dt->super,
+                                             origin_count, origin_addr, 0,
+                                             &(osc_req->origin_convertor));
+
+    if (opal_convertor_on_device(&osc_req->origin_convertor)) {
+        opal_free_list_return(&mca_osc_ubcl_component.req_free_list, &(osc_req->super));
+        mca_osc_ubcl_warn(OPAL_ERR_NOT_SUPPORTED, "GPU buffer not supported by osc/ubcl");
+        ret = OPAL_ERR_NOT_SUPPORTED;
+        goto exit;
+    }
+
+    err = ubcl_memory_descriptor_build_custom((void *) &(osc_req->origin_convertor),
+                                              osc_ubcl_datatype_pack, osc_ubcl_datatype_unpack,
+                                              osc_ubcl_datatype_mem_size, osc_ubcl_datatype_finish,
+                                              &sbuf_md);
+
+    if (UBCL_SUCCESS != err) {
+        mca_osc_ubcl_error(ubcl_error_to_ompi(err),
+                           "Failed to build custom memory descriptor for input buffer");
+    }
+
+    /* We need to build the iovec to describe the memory representation at the target */
+    target_iov = NULL;
+    target_iov_count = 0;
+    target_addr = (void *) (uintptr_t) (target_disp * disp_unit);
+    if (ompi_datatype_is_contiguous_memory_layout(target_dt, target_count)) {
+        target_iov_count = 1;
+        target_iov = (struct iovec *) malloc(target_iov_count * sizeof(struct iovec));
+
+        span = opal_datatype_span((const opal_datatype_t *) target_dt, target_count, &gap);
+        target_iov[0].iov_base = (char *) target_addr + gap;
+        target_iov[0].iov_len = span;
+    } else {
+        int ret = OMPI_SUCCESS;
+        ret = osc_ubcl_build_ddt_iov(target_addr, proc, target_count, target_dt, &target_iov,
+                                     &target_iov_count);
+        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
+            MCA_OSC_UBCL_REQUEST_FINI(osc_req);
+            opal_free_list_return(&mca_osc_ubcl_component.req_free_list,
+                                  (opal_free_list_item_t *) osc_req);
+            return ret;
+        }
+    }
+
+    err = ubcl_get(sbuf_md, target_iov, target_iov_count, endpoint->rank, module->wid,
+                   ubcl_request_complete_cb, osc_req);
+
+    free(target_iov);
+
+    if (UBCL_SUCCESS != err) {
+        MCA_OSC_UBCL_REQUEST_FINI(osc_req);
+        opal_free_list_return(&mca_osc_ubcl_component.req_free_list,
+                              (opal_free_list_item_t *) osc_req);
+        mca_osc_ubcl_error(ubcl_error_to_ompi(err), "Failed to fetch data");
+    }
+
+exit:
+    return ret;
+}
diff --git a/ompi/mca/osc/ubcl/osc_ubcl_info.c b/ompi/mca/osc/ubcl/osc_ubcl_info.c
new file mode 100644
index 00000000000..9a007d415ad
--- /dev/null
+++ b/ompi/mca/osc/ubcl/osc_ubcl_info.c
@@ -0,0 +1,117 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
+/*
+ * Copyright (c) 2025 Bull SAS. All rights reserved.
+ *
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "ompi/mca/osc/ubcl/osc_ubcl_info.h"
+#include "osc_ubcl_utils.h"
+
+static void update_same_disp_unit_info(mca_osc_ubcl_module_t *module, bool value)
+{
+    if (value != module->same_disp_unit) {
+        int my_rank;
+        ptrdiff_t my_disp;
+
+        my_rank = ompi_comm_rank(module->comm);
+        my_disp = osc_ubcl_get_disp_unit(module, my_rank);
+
+        /* The disp_unit array needs to be freed or allocated */
+        osc_ubcl_fini_disp_unit(module);
+        module->same_disp_unit = value;
+        osc_ubcl_sync_disp_unit(module, my_disp, false);
+    }
+}
+
+static const char* update_local_copy(opal_infosubscriber_t *obj, const char *key, const char *value)
+{
+    bool bval;
+    mca_osc_ubcl_module_t *module;
+    struct ompi_win_t *win = (struct ompi_win_t*) obj;
+    module = (mca_osc_ubcl_module_t *) win->w_osc_module;
+
+    bval = opal_str_to_bool(value);
+    mca_osc_ubcl_log(20, "%s updated to %s", key, value);
+
+    if (0 == strcmp(key, "no_locks")) {
+        module->no_locks = bval;
+    } else if (0 == strcmp(key, "same_disp_unit")) {
+        update_same_disp_unit_info(module, bval);
+    }
+
+    /* Do not change the official value. We just needed to update our copy */
+    return value;
+}
+
+static bool get_win_info_bool(struct ompi_win_t *win, char *info_name)
+{
+    bool ret = false;
+    int found;
+
+    opal_info_get_bool(win->super.s_info, info_name, &ret, &found);
+    return ret;
+}
+
+int osc_ubcl_read_info(struct opal_info_t *info, struct ompi_win_t *win)
+{
+    mca_osc_ubcl_module_t *module;
+
+    /* Windows inherit from the opal_infosubscriber_t class; use it to keep
+     * our duplicated values up to date */
+    opal_infosubscribe_subscribe(&win->super, "no_locks", "false", update_local_copy);
+    opal_infosubscribe_subscribe(&win->super, "same_disp_unit", "false", update_local_copy);
+
+    module = (mca_osc_ubcl_module_t *) win->w_osc_module;
+    module->no_locks = get_win_info_bool(module->win, "no_locks");
+    module->same_disp_unit = get_win_info_bool(module->win, "same_disp_unit");
+
+    return OMPI_SUCCESS;
+}
+
+ptrdiff_t osc_ubcl_get_disp_unit(mca_osc_ubcl_module_t *module, int target)
+{
+    if (module->same_disp_unit) {
+        return module->disp_unit.uniq;
+    } else {
+        return module->disp_unit.all[target];
+    }
+}
+
+int osc_ubcl_sync_disp_unit(mca_osc_ubcl_module_t *module, ptrdiff_t disp_unit, bool need_synchro)
+{
+    int ret = OMPI_SUCCESS;
+
+    if (!module->same_disp_unit) {
+        int comm_size = ompi_comm_size(module->comm);
+        int my_rank = ompi_comm_rank(module->comm);
+        module->disp_unit.all = malloc(comm_size * sizeof(ptrdiff_t));
+        if (NULL == module->disp_unit.all) {
+            ret = OMPI_ERR_OUT_OF_RESOURCE;
+            goto exit;
+        }
+        module->disp_unit.all[my_rank] = disp_unit;
+        ret = module->comm->c_coll->coll_allgather(&disp_unit, 1, MPI_AINT, module->disp_unit.all,
+                                                   1, MPI_AINT, module->comm,
+                                                   module->comm->c_coll->coll_allgather_module);
+    } else if (need_synchro) {
+        module->disp_unit.uniq = disp_unit;
+        ret = module->comm->c_coll->coll_barrier(module->comm,
+                                                 module->comm->c_coll->coll_barrier_module);
+    }
+
+exit:
+    return ret;
+}
+
+void osc_ubcl_fini_disp_unit(mca_osc_ubcl_module_t *module)
+{
+    if (!
module->same_disp_unit) { + free(module->disp_unit.all); + module->disp_unit.all = NULL; + } +} diff --git a/ompi/mca/osc/ubcl/osc_ubcl_info.h b/ompi/mca/osc/ubcl/osc_ubcl_info.h new file mode 100644 index 00000000000..1272d355036 --- /dev/null +++ b/ompi/mca/osc/ubcl/osc_ubcl_info.h @@ -0,0 +1,24 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2025 Bull SAS. All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef MCA_OSC_UBCL_INFO_H +#define MCA_OSC_UBCL_INFO_H + +#include "ompi/mca/osc/ubcl/osc_ubcl.h" +#include "opal/util/info.h" +#include "ompi/win/win.h" + +int osc_ubcl_read_info(struct opal_info_t *info, struct ompi_win_t *win); +int osc_ubcl_sync_disp_unit(mca_osc_ubcl_module_t *module, ptrdiff_t disp_unit, bool need_synchro); +ptrdiff_t osc_ubcl_get_disp_unit(mca_osc_ubcl_module_t *module, int target); +void osc_ubcl_fini_disp_unit(mca_osc_ubcl_module_t *module); + +#endif /* MCA_OSC_UBCL_INFO_H */ diff --git a/ompi/mca/osc/ubcl/osc_ubcl_put.c b/ompi/mca/osc/ubcl/osc_ubcl_put.c new file mode 100644 index 00000000000..1a3878410ec --- /dev/null +++ b/ompi/mca/osc/ubcl/osc_ubcl_put.c @@ -0,0 +1,169 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2025 Bull SAS. All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +/** + * @file + * + * Bull eXtreme Interconnect OSC API implementation. + * + * Implementation of API defined in osc.h. To see parameters and return values + * of these functions, refer to ompi/mca/osc/osc.h. + */ + +#include "ompi/mca/osc/ubcl/osc_ubcl.h" +#include "ompi/mca/osc/ubcl/osc_ubcl_info.h" +#include "ompi/mca/osc/ubcl/osc_ubcl_request.h" +#include "opal/mca/common/ubcl/common_ubcl.h" +#include "ompi/mca/osc/ubcl/osc_ubcl_utils.h" +#include "ompi/mca/osc/ubcl/osc_ubcl_sync.h" +#include "ompi/mca/common/ubcl/common_ubcl.h" + +int ompi_osc_ubcl_put(const void *origin_addr, size_t origin_count, + struct ompi_datatype_t *origin_dt, int target, + ptrdiff_t target_disp, size_t target_count, + struct ompi_datatype_t *target_dt, struct ompi_win_t *win) +{ + return ompi_osc_ubcl_rput(origin_addr, origin_count, origin_dt, + target, target_disp, target_count, target_dt, + win, NULL); +} + +int ompi_osc_ubcl_rput(const void *origin_addr, size_t origin_count, + struct ompi_datatype_t *origin_dt, int target, + ptrdiff_t target_disp, size_t target_count, + struct ompi_datatype_t *target_dt, + struct ompi_win_t *win, + struct ompi_request_t **ompi_req) +{ + ubcl_error_t err = 0; + int ret = OMPI_SUCCESS; + ptrdiff_t disp_unit; + ptrdiff_t gap; + size_t span; + size_t target_iov_count; + struct iovec *target_iov; + void *target_addr; + mca_common_ubcl_endpoint_t *endpoint; + ubcl_memory_descriptor_t sbuf_md; + mca_osc_ubcl_module_t *module = (mca_osc_ubcl_module_t *) win->w_osc_module; + mca_osc_ubcl_request_t *osc_req; + + disp_unit = osc_ubcl_get_disp_unit(module, target); + OPAL_OUTPUT_VERBOSE( + (50, mca_osc_ubcl_component.output, "UBCL_OSC_PUT to window %lu\n", module->wid)); + + /* Get proc */ + ompi_proc_t *proc; + proc = ompi_group_peer_lookup_existing(win->w_group, target); + if (OPAL_UNLIKELY(NULL == proc)) { + ret = OMPI_ERR_BAD_PARAM; + mca_osc_ubcl_warn(ret, "Unknown rank %d on window %d", target, module->wid); + goto exit; + } + + span = opal_datatype_span((const opal_datatype_t *) origin_dt, origin_count, &gap); + if (0 == span) { + if (NULL != ompi_req) { + *ompi_req = 
&ompi_request_empty; + } + return OMPI_SUCCESS; + } + + /* We retrieve endpoints created by the PML at init */ + endpoint = (mca_common_ubcl_endpoint_t *) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PML]; + if (OMPI_SUCCESS != ompi_osc_ubcl_check_access_epoch(target, win)) { + return OMPI_ERR_RMA_CONFLICT; + } + + /* Allocate an OSC request */ + osc_req = (mca_osc_ubcl_request_t *) opal_free_list_get(&mca_osc_ubcl_component.req_free_list); + if (OPAL_UNLIKELY(NULL == osc_req)) { + ret = OMPI_ERR_OUT_OF_RESOURCE; + mca_osc_ubcl_warn(ret, "Not enough memory to allocate an OSC request"); + goto exit; + } + if (NULL != ompi_req) { + MCA_OSC_UBCL_REQUEST_INIT(osc_req, target, origin_dt, target_dt, win, true); + *ompi_req = &osc_req->ompi_req; + } else { + MCA_OSC_UBCL_REQUEST_INIT(osc_req, target, origin_dt, target_dt, win, false); + } + + /* Init UBCL MD */ + err = ubcl_memory_descriptor_init(&sbuf_md); + if (UBCL_SUCCESS != err) { + /* This should never happen: ubcl_memory_descriptor_init just assign values */ + mca_osc_ubcl_error(ubcl_error_to_ompi(err), "Failed to initialize ubcl MD"); + } + + /* If we don't need to pack we can build a contiguous */ + if (ompi_datatype_is_contiguous_memory_layout(origin_dt, origin_count)) { + err = ubcl_memory_descriptor_build_contiguous(((char *) origin_addr) + gap, span, &sbuf_md); + if (UBCL_SUCCESS != err) { + mca_osc_ubcl_error(ubcl_error_to_ompi(err), + "Failed to build contiguous memory descriptor for input buffer"); + } + } + + /* Always build a custom MD representation so that we have a fallback */ + opal_convertor_copy_and_prepare_for_send(proc->super.proc_convertor, &origin_dt->super, + origin_count, origin_addr, 0, + &(osc_req->origin_convertor)); + + if (opal_convertor_on_device(&osc_req->origin_convertor)) { + opal_free_list_return(&mca_osc_ubcl_component.req_free_list, &(osc_req->super)); + mca_osc_ubcl_warn(OPAL_ERR_NOT_SUPPORTED, "GPU buffer not supported by osc/ubcl"); + ret = OPAL_ERR_NOT_SUPPORTED; + goto exit; + } + + err = ubcl_memory_descriptor_build_custom((void *) &(osc_req->origin_convertor), + osc_ubcl_datatype_pack, osc_ubcl_datatype_unpack, + osc_ubcl_datatype_mem_size, osc_ubcl_datatype_finish, + &sbuf_md); + + if (UBCL_SUCCESS != err) { + mca_osc_ubcl_error(ubcl_error_to_ompi(err), + "Failed to build custom memory descriptor for input buffer"); + } + + /* We need to build the iovec to describe the memory representation at the target */ + target_iov = NULL; + target_iov_count = 0; + target_addr = (void *) (uintptr_t) (target_disp * disp_unit); + if (ompi_datatype_is_contiguous_memory_layout(target_dt, target_count)) { + target_iov_count = 1; + target_iov = (struct iovec *) malloc(target_iov_count * sizeof(struct iovec)); + + span = opal_datatype_span((const opal_datatype_t *) target_dt, target_count, &gap); + target_iov[0].iov_base = (char *) target_addr + gap; + target_iov[0].iov_len = span; + } else { + int ret = OMPI_SUCCESS; + ret = osc_ubcl_build_ddt_iov(target_addr, proc, target_count, target_dt, &target_iov, + &target_iov_count); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + return ret; + } + } + + err = ubcl_put(sbuf_md, target_iov, target_iov_count, endpoint->rank, module->wid, + ubcl_request_complete_cb, osc_req); + + free(target_iov); + + if (UBCL_SUCCESS != err) { + mca_osc_ubcl_error(ubcl_error_to_ompi(err), "Failed to send data"); + } + +exit: + return ret; +} diff --git a/ompi/mca/osc/ubcl/osc_ubcl_request.c b/ompi/mca/osc/ubcl/osc_ubcl_request.c new file mode 100644 index 00000000000..6cd5fdcac58 --- /dev/null 
+++ b/ompi/mca/osc/ubcl/osc_ubcl_request.c
@@ -0,0 +1,109 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
+/*
+ * Copyright (c) 2025 Bull SAS. All rights reserved.
+ *
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include <stddef.h>
+#include "ompi/mca/osc/ubcl/osc_ubcl.h"
+#include "ompi/mca/osc/ubcl/osc_ubcl_request.h"
+#include "ompi/mca/osc/ubcl/osc_ubcl_utils.h"
+#include "ompi/mca/common/ubcl/common_ubcl.h"
+
+#define container_of(ptr, type, member) ((type *) ((char *) (ptr) -offsetof(type, member)))
+
+/* Should be filtered out by MPI_Start based on request->type, but maybe not
+ * by MPI_Startall */
+static int osc_ubcl_request_start(size_t count, struct ompi_request_t **requests)
+{
+    (void) count;
+    (void) requests;
+    return MPI_ERR_REQUEST;
+}
+
+static int osc_ubcl_request_free(struct ompi_request_t **request)
+{
+    OPAL_OUTPUT_VERBOSE((50, mca_osc_ubcl_component.output,
+                         "OSC/UBCL REQUEST_FINALIZE BEGIN osc_req=%p\n", (void *) request));
+    mca_osc_ubcl_request_t *req;
+    opal_free_list_item_t * item;
+
+    req = container_of((*request), mca_osc_ubcl_request_t, ompi_req);
+    item = (opal_free_list_item_t *) req;
+
+    if (!REQUEST_COMPLETE(&(req)->ompi_req)) {
+        abort();
+    }
+
+    *request = MPI_REQUEST_NULL;
+    opal_free_list_return(&mca_osc_ubcl_component.req_free_list, item);
+
+    return OMPI_SUCCESS;
+}
+
+/* Cannot cancel osc requests */
+static int osc_ubcl_request_cancel(struct ompi_request_t *request, int complete)
+{
+    (void) request;
+    (void) complete;
+    return MPI_ERR_REQUEST;
+}
+
+/* Called on free_list init during OBJ_CONSTRUCT */
+static void osc_ubcl_request_construct(mca_osc_ubcl_request_t *request)
+{
+    request->ompi_req.req_type = OMPI_REQUEST_WIN;
+    request->ompi_req.req_status._cancelled = 0;
+    request->ompi_req.req_free = osc_ubcl_request_free;
+    request->ompi_req.req_cancel = osc_ubcl_request_cancel;
+    request->ompi_req.req_start = osc_ubcl_request_start;
+}
+
+/* Completion callback provided to ubcl */
+void ubcl_request_complete_cb(ubcl_status_t status, void *cb_data)
+{
+    ompi_request_t *request;
+    mca_osc_ubcl_request_t *osc_req;
+    mca_osc_ubcl_module_t *module;
+    size_t segment_count;
+    size_t segment_acked;
+
+    osc_req = (mca_osc_ubcl_request_t *) cb_data;
+    request = &osc_req->ompi_req;
+    module = (mca_osc_ubcl_module_t *) osc_req->win->w_osc_module;
+
+    OPAL_OUTPUT_VERBOSE((50, mca_osc_ubcl_component.output,
+                         "OSC/UBCL DATA TRANSFER COMPLETE mpi_req=%p\n", (void *) request));
+
+    mca_common_ubcl_status_to_ompi(&request->req_status, status, module->comm, -1);
+    if (MPI_STATUS_IGNORE != &request->req_status) {
+        request->req_status.MPI_ERROR = ubcl_error_to_ompi(status.status);
+    }
+    if (UBCL_SUCCESS != status.status) {
+        mca_osc_ubcl_error(OMPI_ERROR, "UBCL error at request completion");
+    }
+
+    segment_count = osc_req->segment_count;
+    segment_acked = opal_atomic_add_fetch_64((int64_t *) &osc_req->segment_ack, 1);
+
+    if (segment_count == segment_acked) {
+        MCA_OSC_UBCL_REQUEST_FINI(osc_req);
+        ompi_request_complete(request, true);
+        /* Free is called once the completed request is waited/tested.
+         * However, if this request comes from a non-request-based call,
+         * MPI_Wait will never be invoked, so osc_ubcl_request_free must be
+         * called manually here */
+        if (!osc_req->is_request_based) {
+            osc_ubcl_request_free(&request);
+        }
+    }
+}
+
+OBJ_CLASS_INSTANCE(mca_osc_ubcl_request_t,
+                   opal_free_list_item_t,
+                   osc_ubcl_request_construct,
+                   NULL);
diff --git a/ompi/mca/osc/ubcl/osc_ubcl_request.h 
b/ompi/mca/osc/ubcl/osc_ubcl_request.h new file mode 100644 index 00000000000..9b744833230 --- /dev/null +++ b/ompi/mca/osc/ubcl/osc_ubcl_request.h @@ -0,0 +1,79 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2025 Bull SAS. All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef MCA_OSC_UBCL_REQUEST_H +#define MCA_OSC_UBCL_REQUEST_H + +#include +#include "ompi/datatype/ompi_datatype.h" +#include "ompi/request/request.h" + +struct mca_osc_ubcl_request_s { + opal_free_list_item_t super; + ompi_request_t ompi_req; /**< Base request */ + struct ompi_win_t *win; + + uint64_t is_request_based : 1; + uint64_t unused : 63; + + ompi_datatype_t *origin_dt; + ompi_datatype_t *target_dt; + + opal_convertor_t origin_convertor; + + /* Non contiguous accumulate are segmented */ + size_t segment_count; + + /* Track that all segments are finished before completing the user's request */ + size_t segment_ack; +}; +typedef struct mca_osc_ubcl_request_s mca_osc_ubcl_request_t; +OBJ_CLASS_DECLARATION(mca_osc_ubcl_request_t); + +/* callback required by ubcl */ +void ubcl_request_complete_cb(ubcl_status_t status, void *cb_data); + +/** + * Generic convenient macros + */ +#define MCA_OSC_UBCL_REQUEST_INIT(req, _dst, _origin_dt, _target_dt, _win, _is_request_based) \ + do { \ + OBJ_RETAIN(_win); \ + if (NULL != _origin_dt) { \ + OMPI_DATATYPE_RETAIN(_origin_dt); \ + } \ + if (NULL != _target_dt) { \ + OMPI_DATATYPE_RETAIN(_target_dt); \ + } \ + OMPI_REQUEST_INIT(&(req)->ompi_req, false); \ + (req)->ompi_req.req_state = OMPI_REQUEST_ACTIVE; \ + (req)->origin_dt = _origin_dt; \ + (req)->target_dt = _target_dt; \ + (req)->win = _win; \ + (req)->is_request_based = _is_request_based; \ + (req)->segment_count = 1; \ + (req)->segment_ack = 0; \ + OBJ_CONSTRUCT(&((req)->origin_convertor), opal_convertor_t); \ + } while (0) + +#define MCA_OSC_UBCL_REQUEST_FINI(req) \ + do { \ + OBJ_RELEASE((req)->win); \ + if (NULL != (req)->origin_dt) { \ + OMPI_DATATYPE_RELEASE((req)->origin_dt); \ + } \ + if (NULL != (req)->target_dt) { \ + OMPI_DATATYPE_RELEASE((req)->target_dt); \ + } \ + OBJ_DESTRUCT(&((req)->origin_convertor)); \ + } while (0) + +#endif //MCA_OSC_UBCL_REQUEST_H diff --git a/ompi/mca/osc/ubcl/osc_ubcl_sync.c b/ompi/mca/osc/ubcl/osc_ubcl_sync.c new file mode 100644 index 00000000000..b47a682feda --- /dev/null +++ b/ompi/mca/osc/ubcl/osc_ubcl_sync.c @@ -0,0 +1,788 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2025 Bull SAS. All rights reserved. 
+ * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "opal/mca/common/ubcl/common_ubcl.h" +#include "ompi/mca/osc/ubcl/osc_ubcl.h" +#include "ompi/mca/osc/ubcl/osc_ubcl_utils.h" +#include "ompi/mca/osc/ubcl/osc_ubcl_sync.h" +#include "ompi/mca/common/ubcl/common_ubcl.h" + +#define FAIL_IF_NOT_LOCKED(win, _op) \ + do { \ + if (!ompi_osc_ubcl_is_locked(win)) { \ + mca_osc_ubcl_warn(OMPI_ERR_RMA_SYNC, "Attempt %s on a non locked window", _op); \ + return OMPI_ERR_RMA_SYNC; \ + } \ + } while (0) + +static const char *osc_ubcl_sync_name(ubcl_win_sync_type_t type) +{ + switch (type) { + case UBCL_WIN_SYNC_NONE: + return "NO_SYNC"; + case UBCL_WIN_SYNC_LOCK: + return "LOCK"; + case UBCL_WIN_SYNC_LOCK_ALL: + return "LOCK_ALL"; + case UBCL_WIN_SYNC_PSCW: + return "PSCW"; + case UBCL_WIN_SYNC_FENCE: + return "FENCE"; + case UBCL_WIN_SYNC_FENCE_EPOCH: + return "FENCE_WITH_COMMUNICATIONS"; + default: + return "???"; + } +} + +static bool ompi_osc_ubcl_is_locked(struct ompi_win_t *win) +{ + mca_osc_ubcl_module_t *module; + + module = (mca_osc_ubcl_module_t *) win->w_osc_module; + return module->passive_lock_refcount || UBCL_WIN_SYNC_LOCK == module->sync_type + || UBCL_WIN_SYNC_LOCK_ALL == module->sync_type + || UBCL_WIN_SYNC_LOCK_ALL_NO_CHECK == module->sync_type; +} + +int ompi_osc_ubcl_check_access_epoch(int target_rank, struct ompi_win_t *win) +{ + int ret; + ubcl_win_sync_type_t rank_lock_type; + mca_osc_ubcl_module_t *module = (mca_osc_ubcl_module_t *) win->w_osc_module; + int real_rank; + + ret = OMPI_SUCCESS; + + switch (module->sync_type) { + case UBCL_WIN_SYNC_LOCK: + /* Check if there is an access epoch for this target */ + rank_lock_type = module->procs_sync_type[target_rank]; + if (UBCL_WIN_SYNC_NONE == rank_lock_type) { + ret = OMPI_ERR_RMA_SYNC; + mca_osc_ubcl_warn(ret, "Invalid epoch: target %d is not locked on window %s", + target_rank, win->w_name); + } + break; + case UBCL_WIN_SYNC_LOCK_ALL: + case UBCL_WIN_SYNC_LOCK_ALL_NO_CHECK: + ret = OMPI_SUCCESS; + break; + case UBCL_WIN_SYNC_NONE: + ret = OMPI_ERR_RMA_SYNC; + mca_osc_ubcl_warn(ret, "Invalid epoch: no epoch started on window %s", win->w_name); + break; + case UBCL_WIN_SYNC_PSCW: + /* Check if there is an access epoch for this target */ + if (NULL == module->active_sync_access_group) { + ret = OMPI_ERR_RMA_SYNC; + mca_osc_ubcl_warn(ret, "Invalid epoch: no access group defined for " + "window %s in an active target epoch", win->w_name); + } else if (OMPI_SUCCESS != ompi_group_translate_ranks(win->w_group, 1, &target_rank, + module->active_sync_access_group, + &real_rank)) { + ret = OMPI_ERR_RMA_SYNC; + mca_osc_ubcl_warn(ret, "Invalid target %d for communications on window %s", + target_rank, win->w_name); + } + break; + case UBCL_WIN_SYNC_FENCE: + case UBCL_WIN_SYNC_FENCE_EPOCH: + module->sync_type = UBCL_WIN_SYNC_FENCE_EPOCH; + ret = OMPI_SUCCESS; + break; + default: + ret = OMPI_ERR_NOT_IMPLEMENTED; + break; + } + return ret; +} + +/* ==== FLUSH ==== */ + +static int osc_ubcl_flush_no_check(int target, struct ompi_win_t *win) +{ + int ret; + int ubcl_ret; + mca_common_ubcl_endpoint_t *endpoint; + mca_osc_ubcl_module_t *module = (mca_osc_ubcl_module_t *) win->w_osc_module; + + ret = OMPI_SUCCESS; + /* Get proc */ + ompi_proc_t *proc = ompi_group_peer_lookup_existing(win->w_group, target); + if (OPAL_UNLIKELY(NULL == proc)) { + ret = OMPI_ERR_BAD_PARAM; + mca_osc_ubcl_warn(ret, "Unknown rank %d in window %s", target, win->w_name); + goto exit; + } + + /* We retrieve endpoints created by 
the PML at init */ + endpoint = (mca_common_ubcl_endpoint_t *) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PML]; + ubcl_ret = ubcl_flush(endpoint->rank, module->wid); + if (UBCL_SUCCESS != ubcl_ret) { + ret = ubcl_error_to_ompi(ubcl_ret); + mca_osc_ubcl_warn(ret, "ubcl_flush returned error %d", ubcl_ret); + } +exit: + return ret; +} + +int ompi_osc_ubcl_flush(int target, struct ompi_win_t *win) +{ + FAIL_IF_NOT_LOCKED(win, "flush"); + return osc_ubcl_flush_no_check(target, win); +} + +static int osc_ubcl_flush_all_no_check(struct ompi_win_t *win) +{ + int size, ret; + size = ompi_group_size(win->w_group); + + for (int i = 0; i < size; i++) { + ret = osc_ubcl_flush_no_check(i, win); + if (OMPI_SUCCESS != ret) { + return ret; + } + } + return OMPI_SUCCESS; +} + +int ompi_osc_ubcl_flush_all(struct ompi_win_t *win) +{ + FAIL_IF_NOT_LOCKED(win, "flush_all"); + return osc_ubcl_flush_all_no_check(win); +} + +int ompi_osc_ubcl_flush_local(int target, struct ompi_win_t *win) +{ + return ompi_osc_ubcl_flush(target, win); +} + +int ompi_osc_ubcl_flush_local_all(struct ompi_win_t *win) +{ + return ompi_osc_ubcl_flush_all(win); +} + + +/* ==== LOCK ==== */ + +int ompi_osc_ubcl_lock(int lock_type, int target, int assert, struct ompi_win_t *win) +{ + ompi_proc_t *proc; + mca_common_ubcl_endpoint_t *endpoint; + ubcl_win_lock_type_t ubcl_type; + int ret; + ubcl_error_t ubcl_err; + mca_osc_ubcl_module_t *module = (mca_osc_ubcl_module_t *) win->w_osc_module; + + if (module->no_locks) { + mca_osc_ubcl_error(OMPI_ERR_RMA_SYNC, "MPI_Win_lock : window %d is no_locks=true", module->wid); + } + + OPAL_THREAD_LOCK(&module->sync_lock); + + /* check synchronization type */ + if (UBCL_WIN_SYNC_NONE != module->sync_type && UBCL_WIN_SYNC_LOCK != module->sync_type + && UBCL_WIN_SYNC_FENCE != module->sync_type) { + ret = OMPI_ERR_RMA_CONFLICT; + mca_osc_ubcl_warn(ret, "Failed to lock window %s already in sync type %s", + win->w_name, osc_ubcl_sync_name(module->sync_type)); + goto return_locked; + } + + if (MPI_LOCK_EXCLUSIVE == lock_type) { + ubcl_type = UBCL_WIN_LOCK_EXCLUSIVE; + } else if (MPI_LOCK_SHARED == lock_type) { + ubcl_type = UBCL_WIN_LOCK_SHARED; + } else { + ret = OMPI_ERR_BAD_PARAM; + mca_osc_ubcl_warn(ret, "MPI_Win_lock : lock type %d is unknown", lock_type); + goto return_locked; + } + + proc = ompi_group_peer_lookup_existing(win->w_group, target); + if (OPAL_UNLIKELY(NULL == proc)) { + ret = OMPI_ERR_BAD_PARAM; + mca_osc_ubcl_warn(ret, "Cannot lock target %d on window %s", target, win->w_name); + goto return_locked; + } + + endpoint = (mca_common_ubcl_endpoint_t *) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PML]; + + /* check access epoch */ + if (UBCL_WIN_SYNC_NONE != module->procs_sync_type[target]) { + ret = OMPI_ERR_RMA_CONFLICT; + mca_osc_ubcl_warn(ret, "Target %d is already locked on window %s", + target, win->w_name); + goto return_locked; + } + + /* As no other process will attempt to acquire this lock while we have it, + * we don't need to actually take it + */ + if (0 != (MPI_MODE_NOCHECK & assert)) { + module->procs_sync_type[target] = UBCL_WIN_SYNC_LOCK_NO_CHECK; + ret = OMPI_SUCCESS; + goto no_check; + } + + ubcl_err = ubcl_win_lock(ubcl_type, endpoint->rank, module->wid); + ret = ubcl_error_to_ompi(ubcl_err); + if (OMPI_SUCCESS != ret) { + /* Remote rank may be locked for ever: no recovery possible */ + mca_osc_ubcl_error(ret, "MPI_Win_lock failed"); + goto return_locked; + } + + module->procs_sync_type[target] = UBCL_WIN_SYNC_LOCK; +no_check: + module->sync_type = UBCL_WIN_SYNC_LOCK; + 
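+    /* Illustrative note (assumed standard MPI user code, not from this
+     * patch): the refcount incremented below tracks how many per-target
+     * passive epochs are currently open, e.g.
+     *
+     *   MPI_Win_lock(MPI_LOCK_SHARED, 0, 0, win);
+     *   MPI_Win_lock(MPI_LOCK_SHARED, 1, 0, win);
+     *   MPI_Win_unlock(0, win);    refcount 2 -> 1, sync_type stays LOCK
+     *   MPI_Win_unlock(1, win);    refcount 1 -> 0, sync_type reset to NONE
+     */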
opal_atomic_fetch_add_64(&module->passive_lock_refcount, 1); + +return_locked: + OPAL_THREAD_UNLOCK(&module->sync_lock); + + return ret; +} + +int ompi_osc_ubcl_unlock(int target, struct ompi_win_t *win) +{ + ompi_proc_t *proc; + mca_common_ubcl_endpoint_t *endpoint; + int ret; + ubcl_error_t ubcl_err; + mca_osc_ubcl_module_t *module = (mca_osc_ubcl_module_t *) win->w_osc_module; + + if (module->no_locks) { + mca_osc_ubcl_error(OMPI_ERR_RMA_SYNC, "MPI_Win_unlock : window %d is no_locks=true", module->wid); + } + + OPAL_THREAD_LOCK(&module->sync_lock); + + /* check synchronization type */ + if (UBCL_WIN_SYNC_LOCK != module->sync_type + || (UBCL_WIN_SYNC_LOCK != module->procs_sync_type[target] + && UBCL_WIN_SYNC_LOCK_NO_CHECK != module->procs_sync_type[target])) { + ret = OMPI_ERR_RMA_CONFLICT; + mca_osc_ubcl_warn(ret, "Target %d is not locked so it cannot be unlocked " + "window %s (sync type %s)", + target, win->w_name, osc_ubcl_sync_name(module->sync_type)); + goto return_locked; + } + + /* Get proc */ + proc = ompi_group_peer_lookup_existing(win->w_group, target); + if (OPAL_UNLIKELY(NULL == proc)) { + ret = OMPI_ERR_BAD_PARAM; + mca_osc_ubcl_warn(ret, "Unknown rank %d on window %s", target, win->w_name); + goto return_locked; + } + + /* check exposure epoch */ + endpoint = (mca_common_ubcl_endpoint_t *) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PML]; + + ret = osc_ubcl_flush_no_check(target, win); + if (OMPI_SUCCESS != ret) { + goto return_locked; + } + + /* We did not really take this lock, so no need to release it */ + if (UBCL_WIN_SYNC_LOCK_NO_CHECK == module->procs_sync_type[target]) { + ret = OMPI_SUCCESS; + goto no_check; + } + + ubcl_err = ubcl_win_unlock(endpoint->rank, module->wid); + ret = ubcl_error_to_ompi(ubcl_err); + if (OMPI_SUCCESS != ret) { + mca_osc_ubcl_warn(ret, "MPI_Win_unlock failed"); + goto return_locked; + } + +no_check: + if (0 == opal_atomic_sub_fetch_64(&module->passive_lock_refcount, 1)) { + module->sync_type = UBCL_WIN_SYNC_NONE; + } + module->procs_sync_type[target] = UBCL_WIN_SYNC_NONE; + +return_locked: + OPAL_THREAD_UNLOCK(&module->sync_lock); + return ret; +} + +static int get_all_ubcl_ranks(struct ompi_win_t *win, ubcl_rank_t *all_ranks) +{ + int group_size; + int ret = OMPI_SUCCESS; + + group_size = ompi_group_size(win->w_group); + + for (int i = 0; i < group_size; ++i) { + ompi_proc_t *proc; + mca_common_ubcl_endpoint_t *endpoint; + + proc = ompi_group_peer_lookup_existing(win->w_group, i); + + if (OPAL_UNLIKELY(NULL == proc)) { + mca_osc_ubcl_warn(ret, "Unknown %d-th proc on window %s", i, win->w_name); + return OMPI_ERR_BAD_PARAM; + } + + endpoint = (mca_common_ubcl_endpoint_t *) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PML]; + all_ranks[i] = endpoint->rank; + } + return ret; +} + +/* lock_all doesn't need to check the exposure epoch because if there was another + * one started (individual lock or lock_all) then module->sync_type would be + * different from UBCL_WIN_SYNC_NONE therefore returning OMPI_ERR_RMA_CONFLICT. 
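+ * As a hypothetical example of the erroneous sequence this check catches:
+ *
+ *   MPI_Win_lock_all(0, win);
+ *   MPI_Win_lock_all(0, win);    second call finds sync_type == LOCK_ALL
+ *                                and fails with OMPI_ERR_RMA_CONFLICT
+ *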
+ * Stemming from this, unlock_all doesn't need to check the epoch either
+ */
+int ompi_osc_ubcl_lock_all(int assert, struct ompi_win_t *win)
+{
+    ubcl_rank_t *all_ranks;
+    int group_size, ret;
+    mca_osc_ubcl_module_t *module = (mca_osc_ubcl_module_t *) win->w_osc_module;
+
+    if (module->no_locks) {
+        mca_osc_ubcl_error(OMPI_ERR_RMA_SYNC, "MPI_Win_lockall : window %lu is no_locks=true", module->wid);
+    }
+
+    /* check access epoch */
+    if (UBCL_WIN_SYNC_NONE != module->sync_type && UBCL_WIN_SYNC_FENCE != module->sync_type) {
+        ret = OMPI_ERR_RMA_CONFLICT;
+        mca_osc_ubcl_warn(ret, "Failed to lock_all window %s already in sync type %s",
+                          win->w_name, osc_ubcl_sync_name(module->sync_type));
+        return ret;
+    }
+
+    if (0 != (MPI_MODE_NOCHECK & assert)) {
+        module->sync_type = UBCL_WIN_SYNC_LOCK_ALL_NO_CHECK;
+        ret = OMPI_SUCCESS;
+        goto no_check;
+    }
+
+    group_size = ompi_group_size(win->w_group);
+    all_ranks = malloc(group_size * sizeof(ubcl_rank_t));
+    if (NULL == all_ranks) {
+        return OMPI_ERR_OUT_OF_RESOURCE;
+    }
+    ret = get_all_ubcl_ranks(win, all_ranks);
+    if (OMPI_SUCCESS != ret) {
+        goto exit_malloced;
+    }
+
+    ret = ubcl_error_to_ompi(ubcl_win_lock_multiple(all_ranks, group_size, module->wid));
+    if (OMPI_SUCCESS != ret) {
+        /* Undefined state. Nothing can be retried safely */
+        mca_osc_ubcl_error(ret, "MPI_Win_lock_all failed");
+    }
+
+    opal_atomic_fetch_add_64(&module->passive_lock_refcount, group_size);
+    module->sync_type = UBCL_WIN_SYNC_LOCK_ALL;
+
+exit_malloced:
+    free(all_ranks);
+
+no_check:
+    return ret;
+}
+
+int ompi_osc_ubcl_unlock_all(struct ompi_win_t *win)
+{
+    ubcl_rank_t *all_ranks;
+    int group_size, ret;
+    mca_osc_ubcl_module_t *module = (mca_osc_ubcl_module_t *) win->w_osc_module;
+
+    if (module->no_locks) {
+        mca_osc_ubcl_error(OMPI_ERR_RMA_SYNC, "MPI_Win_unlockall : window %lu is no_locks=true", module->wid);
+    }
+
+    if (UBCL_WIN_SYNC_LOCK_ALL_NO_CHECK == module->sync_type) {
+        ret = osc_ubcl_flush_all_no_check(win);
+        module->sync_type = UBCL_WIN_SYNC_NONE;
+        goto no_check;
+    }
+
+    /* check access epoch */
+    if (UBCL_WIN_SYNC_LOCK_ALL != module->sync_type) {
+        return OMPI_ERR_RMA_CONFLICT;
+    }
+
+    group_size = ompi_group_size(win->w_group);
+    all_ranks = malloc(group_size * sizeof(ubcl_rank_t));
+    if (NULL == all_ranks) {
+        return OMPI_ERR_OUT_OF_RESOURCE;
+    }
+    ret = get_all_ubcl_ranks(win, all_ranks);
+    if (OMPI_SUCCESS != ret) {
+        goto exit_malloced;
+    }
+
+    osc_ubcl_flush_all_no_check(win);
+    ret = ubcl_error_to_ompi(ubcl_win_unlock_multiple(all_ranks, group_size, module->wid));
+    if (OMPI_SUCCESS != ret) {
+        /* Undefined state. Nothing can be retried safely */
+        mca_osc_ubcl_error(ret, "MPI_Win_unlock_all failed");
+    }
+
+    if (0 == opal_atomic_sub_fetch_64(&module->passive_lock_refcount, group_size)) {
+        module->sync_type = UBCL_WIN_SYNC_NONE;
+    }
+
+exit_malloced:
+    free(all_ranks);
+
+no_check:
+    return ret;
+}
+
+
+/* ==== Active target Post/Start/Complete/Wait ==== */
+
+int ompi_osc_ubcl_start(struct ompi_group_t *group, int assert, struct ompi_win_t *win)
+{
+    /* We cannot take benefit from this assertion:
+     * MPI_MODE_NOCHECK: As we still need to synchronize the end of the epoch,
+     * we cannot bypass synchronization calls. 
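+     *
+     * For orientation, the generalized active target pairing handled here is
+     * the usual PSCW sequence (sketch with assumed user-side names):
+     *
+     *   target side:                     origin side:
+     *     MPI_Win_post(grp, 0, win);       MPI_Win_start(grp, 0, win);
+     *                                      MPI_Put(buf, 1, MPI_INT, peer,
+     *                                              0, 1, MPI_INT, win);
+     *     MPI_Win_wait(win);               MPI_Win_complete(win);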
+ */ + (void) assert; + int ret; + ompi_proc_t *proc; + int group_size; + mca_common_ubcl_endpoint_t *endpoint; + mca_osc_ubcl_module_t *module = (mca_osc_ubcl_module_t *) win->w_osc_module; + + OPAL_THREAD_LOCK(&module->sync_lock); + + /* We should be able to create an access and an exposure epoch simultaneously */ + if (NULL != module->active_sync_access_group + || ( UBCL_WIN_SYNC_NONE != module->sync_type + && UBCL_WIN_SYNC_FENCE != module->sync_type + && UBCL_WIN_SYNC_PSCW != module->sync_type )) { + ret = OMPI_ERR_RMA_SYNC; + mca_osc_ubcl_warn(ret, "Failed to start window %s already in sync type %s", + win->w_name, osc_ubcl_sync_name(module->sync_type)); + goto return_locked; + } + + group_size = ompi_group_size(group); + for (int i = 0; i < group_size; ++i) { + ubcl_error_t ubcl_err; + + proc = ompi_group_peer_lookup_existing(group, i); + if (OPAL_UNLIKELY(NULL == proc)) { + /* Partial retries are not possible here so errors are fatal */ + mca_osc_ubcl_error(OMPI_ERROR, "Unknown %d-th rank asked to %s", i, __func__); + } + + endpoint = (mca_common_ubcl_endpoint_t *) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PML]; + ubcl_err = ubcl_win_initiator_waits_lock(endpoint->rank, module->wid); + if (UBCL_SUCCESS != ubcl_err) { + ret = ubcl_error_to_ompi(ubcl_err); + /* Partial retries are not possible here so errors are fatal */ + mca_osc_ubcl_error(ret, + "[win %s] Start failed waiting process %d to accept the lock", + win->w_name, i); + goto return_locked; + } + } + + module->sync_type = UBCL_WIN_SYNC_PSCW; + OBJ_RETAIN(group); + module->active_sync_access_group = group; + + ret = OMPI_SUCCESS; +return_locked: + OPAL_THREAD_UNLOCK(&module->sync_lock); + return ret; +} + +int ompi_osc_ubcl_complete(struct ompi_win_t *win) +{ + ompi_proc_t *proc; + int group_size; + mca_common_ubcl_endpoint_t *endpoint; + mca_osc_ubcl_module_t *module = (mca_osc_ubcl_module_t *) win->w_osc_module; + int ret; + + OPAL_THREAD_LOCK(&module->sync_lock); + + if (UBCL_WIN_SYNC_PSCW != module->sync_type) { + ret = OMPI_ERR_RMA_CONFLICT; + mca_osc_ubcl_warn(ret, "Failed to complete window %s in sync type %s", + win->w_name, osc_ubcl_sync_name(module->sync_type)); + goto return_locked; + } + + if (NULL == module->active_sync_access_group) { + ret = OMPI_ERROR; + mca_osc_ubcl_warn(ret, "[win %s] no access group for which to complete " + "active target synchronization", win->w_name); + goto return_locked; + } + + ret = osc_ubcl_flush_all_no_check(win); + if (OMPI_SUCCESS != ret) { + goto return_locked; + } + + ubcl_error_t ubcl_err; + /* Call ubcl_win_sync to clean some NIC counterproductive caches */ + ubcl_err = ubcl_win_sync(module->wid); + if (UBCL_SUCCESS != ubcl_err) { + mca_osc_ubcl_error(ubcl_error_to_ompi(ubcl_err), + "[win %s] Call to sync failed, this is not recoverable", win->w_name); + } + + group_size = ompi_group_size(module->active_sync_access_group); + for (int i = 0; i < group_size; ++i) { + + proc = ompi_group_peer_lookup_existing(module->active_sync_access_group, i); + if (OPAL_UNLIKELY(NULL == proc)) { + /* Partial retries are not possible here so errors are fatal */ + mca_osc_ubcl_error(OMPI_ERROR, "Unknown %d-th rank asked to %s", i, __func__); + } + + endpoint = (mca_common_ubcl_endpoint_t *) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PML]; + ubcl_err = ubcl_win_initiator_releases_lock(endpoint->rank, module->wid); + if (UBCL_SUCCESS != ubcl_err) { + ret = ubcl_error_to_ompi(ubcl_err); + /* Partial retries are not possible here so errors are fatal */ + mca_osc_ubcl_error(ret, "[win %s] 
Complete failed while releasing lock " + "for process %d", win->w_name, i); + goto return_locked; + } + } + + /* We want to keep the window marked as in a pscw sync scheme if an exposure epoch exists */ + if (NULL == module->active_sync_exposure_group) { + module->sync_type = UBCL_WIN_SYNC_NONE; + } + OBJ_RELEASE(module->active_sync_access_group); + module->active_sync_access_group = NULL; + + ret = OMPI_SUCCESS; + +return_locked: + OPAL_THREAD_UNLOCK(&module->sync_lock); + return ret; +} + +int ompi_osc_ubcl_post(struct ompi_group_t *group, int assert, struct ompi_win_t *win) +{ + /* We cannot take benefit from these assertions: + * MPI_MODE_NOCHECK: As we still need to synchro the end of the epoch, + * we cannot bypass synchronization calls. + * MPI_MODE_NOSTORE: Window is in unified memory model, operations are not cached. + * MPI_MODE_NOPUT : Window is in unified memory model, operations are not cached. + */ + (void) assert; + int ret; + ompi_proc_t *proc; + int group_size; + mca_common_ubcl_endpoint_t *endpoint; + mca_osc_ubcl_module_t *module = (mca_osc_ubcl_module_t *) win->w_osc_module; + ubcl_error_t ubcl_err; + + OPAL_THREAD_LOCK(&module->sync_lock); + + /* We should be able to create an access and an exposure epoch simultaneously */ + if (NULL != module->active_sync_exposure_group + || ( UBCL_WIN_SYNC_NONE != module->sync_type + && UBCL_WIN_SYNC_FENCE != module->sync_type + && UBCL_WIN_SYNC_PSCW != module->sync_type )) { + ret = OMPI_ERR_RMA_CONFLICT; + mca_osc_ubcl_warn(ret, "Failed to post window %s already in sync type %s", + win->w_name, osc_ubcl_sync_name(module->sync_type)); + goto return_locked; + } + + /* Call ubcl_win_sync to clean some NIC counterproductive caches */ + ubcl_err = ubcl_win_sync(module->wid); + if (UBCL_SUCCESS != ubcl_err) { + mca_osc_ubcl_error(ubcl_error_to_ompi(ubcl_err), + "[win %s] Call to sync failed, this is not recoverable", win->w_name); + } + + group_size = ompi_group_size(group); + for (int i = 0; i < group_size; ++i) { + + proc = ompi_group_peer_lookup_existing(group, i); + if (OPAL_UNLIKELY(NULL == proc)) { + /* Partial retries are not possible here so errors are fatal */ + mca_osc_ubcl_error(OMPI_ERROR, "Unknown %d-th rank asked to %s", i, __func__); + } + + endpoint = (mca_common_ubcl_endpoint_t *) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PML]; + ubcl_err = ubcl_win_target_grants_lock(endpoint->rank, module->wid); + if (UBCL_SUCCESS != ubcl_err) { + ret = ubcl_error_to_ompi(ubcl_err); + /* Partial retries are not possible here so errors are fatal */ + mca_osc_ubcl_error(ret, "[win %s] Post failed while accepting the " + "lock from process %d", win->w_name, i); + goto return_locked; + } + } + module->sync_type = UBCL_WIN_SYNC_PSCW; + OBJ_RETAIN(group); + module->active_sync_exposure_group = group; + module->nb_rank_waited = 0; + + ret = OMPI_SUCCESS; + +return_locked: + OPAL_THREAD_UNLOCK(&module->sync_lock); + return ret; +} + +int ompi_osc_ubcl_wait(struct ompi_win_t *win) +{ + int ret; + ret = ompi_osc_ubcl_test(win, NULL); + return ret; +} + +int ompi_osc_ubcl_test(struct ompi_win_t *win, int *flag) +{ + int ret; + ompi_proc_t *proc; + int group_size; + mca_common_ubcl_endpoint_t *endpoint; + int ubcl_flag = 0; + mca_osc_ubcl_module_t *module = (mca_osc_ubcl_module_t *) win->w_osc_module; + + OPAL_THREAD_LOCK(&module->sync_lock); + if (NULL != flag) { + (*flag) = 0; + } + + if (UBCL_WIN_SYNC_PSCW != module->sync_type) { + ret = OMPI_ERR_RMA_CONFLICT; + mca_osc_ubcl_warn(ret, "Failed to test window %s in sync type %s", + win->w_name, 
osc_ubcl_sync_name(module->sync_type));
+        goto return_locked;
+    }
+
+    if (NULL == module->active_sync_exposure_group) {
+        ret = OMPI_ERR_BAD_PARAM;
+        mca_osc_ubcl_warn(ret, "[win %s] no group to wait lock release from",
+                          win->w_name);
+        goto return_locked;
+    }
+
+    group_size = ompi_group_size(module->active_sync_exposure_group);
+    for (int i = module->nb_rank_waited; i < group_size; ++i) {
+        ubcl_error_t ubcl_err;
+
+        proc = ompi_group_peer_lookup_existing(module->active_sync_exposure_group, i);
+        if (OPAL_UNLIKELY(NULL == proc)) {
+            /* Undefined state. Nothing can be retried safely */
+            mca_osc_ubcl_error(OMPI_ERROR, "Unknown %d-th rank asked to %s", i, __func__);
+        }
+
+        endpoint = (mca_common_ubcl_endpoint_t *) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PML];
+
+        if (NULL != flag) {
+            ubcl_err = ubcl_win_target_tests_lock_release(endpoint->rank, module->wid, &ubcl_flag);
+        } else {
+            ubcl_err = ubcl_win_target_waits_lock_release(endpoint->rank, module->wid);
+            ubcl_flag = 1;
+        }
+
+        if (UBCL_SUCCESS != ubcl_err) {
+            ret = ubcl_error_to_ompi(ubcl_err);
+            mca_osc_ubcl_warn(ret, "[win %s] Test failed while waiting for lock "
+                              "release from the %d-th process", win->w_name, i);
+            goto return_locked;
+        }
+
+        /* If tests_lock_release did not observe the release yet, return now;
+         * the next test will resume at proc number module->nb_rank_waited.
+         * For a wait, ubcl_flag was set to 1 above, so this early return is
+         * skipped */
+        if (!ubcl_flag) {
+            ret = OMPI_SUCCESS;
+            goto return_locked;
+        }
+        module->nb_rank_waited++;
+    }
+
+    /* We want to keep the window marked as in a pscw sync scheme if an access epoch still exists */
+    if (NULL == module->active_sync_access_group) {
+        module->sync_type = UBCL_WIN_SYNC_NONE;
+    }
+
+    OBJ_RELEASE(module->active_sync_exposure_group);
+    module->active_sync_exposure_group = NULL;
+    if (NULL != flag) {
+        (*flag) = 1;
+    }
+
+    ret = OMPI_SUCCESS;
+return_locked:
+    OPAL_THREAD_UNLOCK(&module->sync_lock);
+    return ret;
+}
+
+/* ==== Fence ==== */
+
+int ompi_osc_ubcl_fence(int assert, struct ompi_win_t *win)
+{
+    /* We cannot take benefit from these assertions:
+     * MPI_MODE_NOSTORE : The window is in unified memory model, no operations are cached.
+     * MPI_MODE_NOPUT   : The window is in unified memory model, no operations are cached.
+     */
+    int ret = OMPI_SUCCESS;
+    mca_osc_ubcl_module_t *module = (mca_osc_ubcl_module_t *) win->w_osc_module;
+
+    if (UBCL_WIN_SYNC_FENCE != module->sync_type
+        && UBCL_WIN_SYNC_FENCE_EPOCH != module->sync_type
+        && UBCL_WIN_SYNC_NONE != module->sync_type) {
+        ret = OMPI_ERR_RMA_CONFLICT;
+        mca_osc_ubcl_warn(ret, "Failed to fence window %s in sync type %s",
+                          win->w_name, osc_ubcl_sync_name(module->sync_type));
+        return ret;
+    }
+
+    OPAL_THREAD_LOCK(&module->sync_lock);
+
+    /* If the sync_type is not UBCL_WIN_SYNC_FENCE_EPOCH this should be almost a noop */
+    if (0 == (MPI_MODE_NOPRECEDE & assert)) {
+        ret = osc_ubcl_flush_all_no_check(win);
+        if (OMPI_SUCCESS != ret) {
+            goto return_locked;
+        }
+    }
+
+    /* There is no easy way to detect when the barrier is optional.
+     * The remote process may have started an epoch without us; we need to
+     * wait for its fence to avoid concurrent access. 
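+     *
+     * An assumed application-side pattern for reference; only when both
+     * MPI_MODE_NOPRECEDE and MPI_MODE_NOSUCCEED are given can the flush and
+     * the barrier below be skipped entirely:
+     *
+     *   MPI_Win_fence(MPI_MODE_NOPRECEDE, win);    open: nothing to flush
+     *   MPI_Put(buf, 1, MPI_INT, peer, 0, 1, MPI_INT, win);
+     *   MPI_Win_fence(MPI_MODE_NOSUCCEED, win);    close: flush + barrier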
*/
+    if (0 == (MPI_MODE_NOPRECEDE & assert) || 0 == (MPI_MODE_NOSUCCEED & assert)) {
+        ubcl_error_t ubcl_err;
+        /* Call ubcl_win_sync to clean some NIC counterproductive caches */
+        ubcl_err = ubcl_win_sync(module->wid);
+        if (UBCL_SUCCESS != ubcl_err) {
+            mca_osc_ubcl_error(ubcl_error_to_ompi(ubcl_err),
+                               "[win %s] Call to sync failed, this is not recoverable", win->w_name);
+        }
+
+        ret = module->comm->c_coll->coll_barrier(module->comm,
+                                                 module->comm->c_coll->coll_barrier_module);
+    }
+
+    module->sync_type = UBCL_WIN_SYNC_FENCE;
+
+return_locked:
+    OPAL_THREAD_UNLOCK(&module->sync_lock);
+    return ret;
+}
+
+int ompi_osc_ubcl_sync(struct ompi_win_t *win)
+{
+    (void) win;
+    return OMPI_SUCCESS;
+}
diff --git a/ompi/mca/osc/ubcl/osc_ubcl_sync.h b/ompi/mca/osc/ubcl/osc_ubcl_sync.h
new file mode 100644
index 00000000000..0a4d0149e1f
--- /dev/null
+++ b/ompi/mca/osc/ubcl/osc_ubcl_sync.h
@@ -0,0 +1,45 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
+/*
+ * Copyright (c) 2025 Bull SAS. All rights reserved.
+ *
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#ifndef MCA_OSC_UBCL_SYNC_H
+#define MCA_OSC_UBCL_SYNC_H
+
+typedef enum ubcl_win_sync_type {
+    UBCL_WIN_SYNC_NONE,
+    UBCL_WIN_SYNC_LOCK,
+    UBCL_WIN_SYNC_LOCK_NO_CHECK,
+    UBCL_WIN_SYNC_LOCK_ALL,
+    UBCL_WIN_SYNC_LOCK_ALL_NO_CHECK,
+    UBCL_WIN_SYNC_PSCW,
+    UBCL_WIN_SYNC_FENCE,
+    UBCL_WIN_SYNC_FENCE_EPOCH
+} ubcl_win_sync_type_t;
+
+/* Component API */
+int ompi_osc_ubcl_lock(int lock_type, int target, int assert, struct ompi_win_t *win);
+int ompi_osc_ubcl_unlock(int target, struct ompi_win_t *win);
+int ompi_osc_ubcl_lock_all(int assert, struct ompi_win_t *win);
+int ompi_osc_ubcl_unlock_all(struct ompi_win_t *win);
+
+int ompi_osc_ubcl_start(struct ompi_group_t *group, int assert, struct ompi_win_t *win);
+int ompi_osc_ubcl_complete(struct ompi_win_t *win);
+int ompi_osc_ubcl_post(struct ompi_group_t *group, int assert, struct ompi_win_t *win);
+int ompi_osc_ubcl_wait(struct ompi_win_t *win);
+int ompi_osc_ubcl_test(struct ompi_win_t *win, int *flag);
+
+int ompi_osc_ubcl_fence(int assert, struct ompi_win_t *win);
+
+int ompi_osc_ubcl_sync(struct ompi_win_t *win);
+
+/* OSC/UBCL internals */
+int ompi_osc_ubcl_check_access_epoch(int target_rank, struct ompi_win_t *win);
+
+#endif /* MCA_OSC_UBCL_SYNC_H */
diff --git a/ompi/mca/osc/ubcl/osc_ubcl_utils.h b/ompi/mca/osc/ubcl/osc_ubcl_utils.h
new file mode 100644
index 00000000000..9c9280ceb78
--- /dev/null
+++ b/ompi/mca/osc/ubcl/osc_ubcl_utils.h
@@ -0,0 +1,37 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
+/*
+ * Copyright (c) 2025 Bull SAS. All rights reserved.
+ *
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+/**
+ * @file
+ *
+ * Bull eXtreme Interconnect utilities
+ *
+ * Contains some useful functions
+ *
+ */
+
+#ifndef MCA_OSC_UBCL_UTILS_H
+#define MCA_OSC_UBCL_UTILS_H
+
+#include "ompi/mca/common/ubcl/common_ubcl.h"
+#include "opal/util/output.h"
+
+#define OSC_UBCL_COMP_NAME "OSC/UBCL"
+
+#define mca_osc_ubcl_log(lvl, ...) \
+    opal_output_verbose(lvl, mca_osc_ubcl_component.output, __VA_ARGS__)
+
+#define mca_osc_ubcl_warn(err, format, ...) \
+    _mca_common_ubcl_error(__FILE__, __LINE__, err, false, 5, mca_osc_ubcl_component.output, mca_osc_ubcl_component.is_init, mca_osc_ubcl_component.verbose, OSC_UBCL_COMP_NAME, format, ##__VA_ARGS__)
+#define mca_osc_ubcl_error(err, format, ...) 
\ + _mca_common_ubcl_error(__FILE__, __LINE__, err, true, 1, mca_osc_ubcl_component.output, mca_osc_ubcl_component.is_init, mca_osc_ubcl_component.verbose, OSC_UBCL_COMP_NAME, format, ##__VA_ARGS__) +#define mca_osc_ubcl_help(...) opal_show_help("help-mpi-osc-ubcl.txt", ##__VA_ARGS__) + +#endif /*MCA_OSC_UBCL_UTILS_H */ diff --git a/ompi/mca/pml/ubcl/Makefile.am b/ompi/mca/pml/ubcl/Makefile.am new file mode 100644 index 00000000000..64033940388 --- /dev/null +++ b/ompi/mca/pml/ubcl/Makefile.am @@ -0,0 +1,52 @@ +# Copyright (c) 2019-2024 Bull SAS. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +AM_CPPFLAGS = $(pml_ubcl_CPPFLAGS) + +EXTRA_DIST = post_configure.sh + +ubcl_sources = \ + pml_ubcl_utils.h \ + pml_ubcl_request.h \ + pml_ubcl.h \ + pml_ubcl.c \ + pml_ubcl_utils.c \ + pml_ubcl_isend.c \ + pml_ubcl_irecv.c \ + pml_ubcl_iprobe.c \ + pml_ubcl_progress.c \ + pml_ubcl_request.c \ + pml_ubcl_datatype.c \ + pml_ubcl_component.c \ + pml_ubcl_endpoint.c \ + pml_ubcl_endpoint.h + +if MCA_BUILD_ompi_pml_ubcl_DSO +component_noinst = +component_install = mca_pml_ubcl.la +else +component_noinst = libmca_pml_ubcl.la +component_install = +endif + +mcacomponentdir = $(ompilibdir) +mcacomponent_LTLIBRARIES = $(component_install) +mca_pml_ubcl_la_SOURCES = $(ubcl_sources) +mca_pml_ubcl_la_LIBADD = $(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la \ + $(pml_ubcl_LIBS) \ + $(OPAL_TOP_BUILDDIR)/opal/mca/common/ubcl/lib@OPAL_LIB_NAME@mca_common_ubcl.la \ + $(OMPI_TOP_BUILDDIR)/ompi/mca/common/ubcl/libmca_common_ubcl.la + +mca_pml_ubcl_la_LDFLAGS = -module -avoid-version $(pml_ubcl_LDFLAGS) +mca_pml_ubcl_la_CPPFLAGS = -Wextra -Wall -Werror -Wno-unused-parameter -Wno-missing-field-initializers $(pml_ubcl_CPPFLAGS) + +noinst_LTLIBRARIES = $(component_noinst) +libmca_pml_ubcl_la_SOURCES = $(ubcl_sources) +libmca_pml_ubcl_la_LIBADD = $(pml_ubcl_LIBS) +libmca_pml_ubcl_la_LDFLAGS = -module -avoid-version $(pml_ubcl_LDFLAGS) +libmca_pml_ubcl_la_CPPFLAGS = $(mca_pml_ubcl_la_CPPFLAGS) diff --git a/ompi/mca/pml/ubcl/configure.m4 b/ompi/mca/pml/ubcl/configure.m4 new file mode 100644 index 00000000000..c3159651a41 --- /dev/null +++ b/ompi/mca/pml/ubcl/configure.m4 @@ -0,0 +1,35 @@ +# +# Copyright (c) 2024 Bull SAS. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + + +AC_DEFUN([MCA_ompi_pml_ubcl_POST_CONFIG], [ + AS_IF([test "$1" = "1"], [OMPI_REQUIRE_ENDPOINT_TAG([PML])]) +]) + +AC_DEFUN([MCA_ompi_pml_ubcl_CONFIG], [ + AC_CONFIG_FILES([ompi/mca/pml/ubcl/Makefile]) + + OMPI_CHECK_UBCL([pml_ubcl], + [pml_ubcl_happy="yes"], + [pml_ubcl_happy="no"]) + + AC_REQUIRE([MCA_ompi_common_ubcl_CONFIG]) + AC_REQUIRE([MCA_opal_common_ubcl_CONFIG]) + AC_REQUIRE([OPAL_CHECK_CUDA]) + AC_REQUIRE([OPAL_CHECK_CUDART]) + + AS_IF([test "$pml_ubcl_happy" = "yes"], + [$1], + [$2]) + + # substitute in the things needed to build ubcl + AC_SUBST([pml_ubcl_CPPFLAGS]) + AC_SUBST([pml_ubcl_LDFLAGS]) + AC_SUBST([pml_ubcl_LIBS]) +]) diff --git a/ompi/mca/pml/ubcl/pml_ubcl.c b/ompi/mca/pml/ubcl/pml_ubcl.c new file mode 100644 index 00000000000..730c526d347 --- /dev/null +++ b/ompi/mca/pml/ubcl/pml_ubcl.c @@ -0,0 +1,174 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2019-2024 Bull SAS. All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file pml_ubcl.c + * + * UBCL PML + * + * Implementation of API defined in pml.h. 
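+ * (Assumed call path, simplified: an application-level
+ * MPI_Send(buf, count, dtype, dst, tag, comm) reaches this component as
+ * mca_pml_ubcl_send(buf, count, dtype, dst, tag,
+ * MCA_PML_BASE_SEND_STANDARD, comm).)
+ *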
To see parameters and return values
+ * of these functions, refer to ompi/mca/pml/pml.h.
+ */
+
+#include "opal/mca/common/ubcl/common_ubcl.h"
+#include "ompi/mca/pml/ubcl/pml_ubcl.h"
+#include "ompi/mca/pml/ubcl/pml_ubcl_endpoint.h"
+#include "ompi/constants.h"
+#include "ompi/mca/pml/ubcl/pml_ubcl_utils.h"
+#include "ompi/mca/pml/ubcl/pml_ubcl_request.h"
+#include "ompi/proc/proc.h"
+#include "opal/class/opal_object.h"
+#include "opal/datatype/opal_convertor.h"
+#include "opal/mca/hwloc/hwloc-internal.h"
+#include "opal/prefetch.h"
+#include "opal/util/proc.h"
+#include "ubcl_api.h"
+
+/**
+ * PML UBCL Module
+ *
+ * pml_max_contextid and pml_max_tag are computed from the Portals4 module
+ * match_bits and the platform int size
+ */
+mca_pml_ubcl_module_t mca_pml_ubcl_module = {
+    .super = {
+        .pml_add_procs = mca_pml_ubcl_add_procs,
+        .pml_del_procs = mca_pml_ubcl_del_procs,
+        .pml_enable = mca_pml_ubcl_enable,
+        .pml_progress = mca_pml_ubcl_progress,
+        .pml_add_comm = mca_pml_ubcl_add_comm,
+        .pml_del_comm = mca_pml_ubcl_del_comm,
+        .pml_irecv_init = mca_pml_ubcl_irecv_init,
+        .pml_irecv = mca_pml_ubcl_irecv,
+        .pml_recv = mca_pml_ubcl_recv,
+        .pml_isend_init = mca_pml_ubcl_isend_init,
+        .pml_isend = mca_pml_ubcl_isend,
+        .pml_send = mca_pml_ubcl_send,
+        .pml_iprobe = mca_pml_ubcl_iprobe,
+        .pml_probe = mca_pml_ubcl_probe,
+        .pml_start = mca_pml_ubcl_start,
+        .pml_improbe = mca_pml_ubcl_improbe,
+        .pml_mprobe = mca_pml_ubcl_mprobe,
+        .pml_imrecv = mca_pml_ubcl_imrecv,
+        .pml_mrecv = mca_pml_ubcl_mrecv,
+        .pml_dump = mca_pml_ubcl_dump,
+        .pml_max_contextid = PML_UBCL_MAX_CID, /** Comes from pml_ubcl.h */
+        .pml_max_tag = PML_UBCL_MAX_TAG, /** Comes from pml_ubcl.h */
+        .pml_flags = MCA_PML_BASE_FLAG_REQUIRE_WORLD,
+        .pml_get_transports = NULL
+    }
+};
+
+int mca_pml_ubcl_add_comm(struct ompi_communicator_t *comm)
+{
+    mca_pml_ubcl_comm_t *new_ubcl_comm;
+    ompi_group_t *comm_group;
+
+    new_ubcl_comm = malloc(sizeof(mca_pml_ubcl_comm_t));
+    if (NULL == new_ubcl_comm) {
+        return OMPI_ERR_OUT_OF_RESOURCE;
+    }
+
+    if (OMPI_COMM_IS_INTER(comm)) {
+        comm_group = comm->c_remote_group;
+        new_ubcl_comm->size = ompi_comm_remote_size(comm);
+    } else {
+        comm_group = comm->c_local_group;
+        new_ubcl_comm->size = ompi_comm_size(comm);
+    }
+
+    new_ubcl_comm->array = malloc(new_ubcl_comm->size * sizeof(uint64_t));
+    if (NULL == new_ubcl_comm->array) {
+        free(new_ubcl_comm);
+        return OMPI_ERR_OUT_OF_RESOURCE;
+    }
+
+    /* Build array comm_rank -> ubcl_rank */
+    for (uint32_t i = 0; i < new_ubcl_comm->size; i++) {
+        struct ompi_proc_t *proc;
+        mca_common_ubcl_endpoint_t *endpoint;
+        proc = ompi_group_peer_lookup(comm_group, i);
+        /* In OMPI 5 we sometimes get procs here that didn't go through
+         * 'add_procs'. 
We create them here to avoid any issue; 'add_procs'
+         * tests whether an endpoint is already created, so there is no
+         * problem if it is called again later */
+        endpoint = proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PML];
+        if (NULL == endpoint) {
+            mca_pml_ubcl_add_procs(&proc, 1);
+            endpoint = proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PML];
+        } else {
+            mca_pml_ubcl_endpoint_retain(proc);
+        }
+        new_ubcl_comm->array[i] = endpoint->rank;
+    }
+
+    comm->c_pml_comm = new_ubcl_comm;
+
+    OPAL_OUTPUT_VERBOSE(
+        (50, mca_pml_ubcl_component.output, "UBCL_MODULE_ADD_COMM %s\n", ompi_comm_print_cid(comm)));
+
+    return OMPI_SUCCESS;
+}
+
+int mca_pml_ubcl_del_comm(struct ompi_communicator_t *comm)
+{
+    mca_pml_ubcl_comm_t *pml_comm;
+    ompi_group_t *comm_group;
+    OPAL_OUTPUT_VERBOSE((50, mca_pml_ubcl_component.output, "UBCL_MODULE_DEL_COMM\n"));
+
+    if (NULL == comm->c_pml_comm) {
+        mca_pml_ubcl_error(OMPI_ERR_BAD_PARAM,
+                           "error: suspicious free of a communicator that PML UBCL has never allocated");
+    }
+
+    /* It is important to decrement the refcount and remove endpoints here:
+     * that way, if we create new communicators after MPI_Init, we can free
+     * the endpoints reliably when needed */
+    if (OMPI_COMM_IS_INTER(comm)) {
+        comm_group = comm->c_remote_group;
+    } else {
+        comm_group = comm->c_local_group;
+    }
+    pml_comm = (mca_pml_ubcl_comm_t *) comm->c_pml_comm;
+
+    for (uint32_t i = 0; i < pml_comm->size; i++) {
+        struct ompi_proc_t *proc;
+        proc = ompi_group_peer_lookup(comm_group, i);
+        mca_pml_ubcl_endpoint_release(proc);
+    }
+
+    free(pml_comm->array);
+    free(pml_comm);
+
+    return OMPI_SUCCESS;
+}
+
+/**
+ * Called for BTLs that we do not care about
+ */
+int mca_pml_ubcl_enable(bool enable)
+{
+    OPAL_OUTPUT_VERBOSE((50, mca_pml_ubcl_component.output, "UBCL_MODULE_ENABLE\n"));
+    return OMPI_SUCCESS;
+}
+
+int mca_pml_ubcl_dump(struct ompi_communicator_t *comm, int verbose)
+{
+    OPAL_OUTPUT_VERBOSE((50, mca_pml_ubcl_component.output, "UBCL_MODULE_DUMP\n"));
+    return OMPI_ERROR;
+}
+
+int mca_pml_ubcl_start(size_t count, ompi_request_t **requests)
+{
+    OPAL_OUTPUT_VERBOSE((50, mca_pml_ubcl_component.output, "UBCL_MODULE_START\n"));
+
+    return mca_pml_ubcl_request_start(count, requests);
+}
diff --git a/ompi/mca/pml/ubcl/pml_ubcl.h b/ompi/mca/pml/ubcl/pml_ubcl.h
new file mode 100644
index 00000000000..78e57b5cd63
--- /dev/null
+++ b/ompi/mca/pml/ubcl/pml_ubcl.h
@@ -0,0 +1,197 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
+/*
+ * Copyright (c) 2019-2024 Bull SAS. All rights reserved.
+ *
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+/**
+ * @file pml_ubcl.h
+ *
+ * UBCL PML
+ *
+ * For the standard PML interface, see ompi/mca/pml/pml.h
+ *
+ * For now, pml/ubcl only exposes one module, whose sole purpose is to set the
+ * API functions. It then uses its component all the way through. 
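+ *
+ * Component selection goes through the usual MCA mechanism; assuming the
+ * "priority" parameter registered in pml_ubcl_component.c, a user would
+ * typically force this PML with something like:
+ *
+ *   mpirun --mca pml ubcl --mca pml_ubcl_priority 90 ./app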
+ */
+
+#ifndef MCA_PML_UBCL_H
+#define MCA_PML_UBCL_H
+
+#include "ompi/mca/pml/pml.h"
+#include "opal/class/opal_free_list.h"
+
+#include "ompi/communicator/communicator.h"
+#include "ompi/datatype/ompi_datatype.h"
+#include "ompi/proc/proc.h"
+#include "ompi/request/request.h"
+#include "opal/mca/mca.h"
+#include "opal/mca/threads/mutex.h"
+
+#include "opal/mca/common/ubcl/common_ubcl.h"
+#include "ompi/mca/pml/ubcl/pml_ubcl_endpoint.h"
+
+#include <stddef.h>
+
+#define container_of(ptr, type, member) ((type *) ((char *) (ptr) -offsetof(type, member)))
+
+#define PML_UBCL_THREAD_ONLY if (OPAL_UNLIKELY(mca_pml_ubcl_component.thread_multiple_enabled))
+#define pml_ubcl_lock(_lock) PML_UBCL_THREAD_ONLY opal_atomic_lock(_lock)
+#define pml_ubcl_unlock(_lock) PML_UBCL_THREAD_ONLY opal_atomic_unlock(_lock)
+
+/* Because UBCL_MAX_TAG overflows if put in an int */
+#if (UBCL_MAX_TAG < INT_MAX)
+    #define PML_UBCL_MAX_TAG UBCL_MAX_TAG
+#else /* (UBCL_MAX_TAG < INT_MAX) */
+    #define PML_UBCL_MAX_TAG INT_MAX
+#endif /* (UBCL_MAX_TAG < INT_MAX) */
+
+/* Because UBCL_MAX_CID overflows if put into an uint32_t */
+#if (UBCL_MAX_CID < UINT32_MAX)
+    #define PML_UBCL_MAX_CID UBCL_MAX_CID
+#else /* (UBCL_MAX_CID < UINT32_MAX) */
+    #define PML_UBCL_MAX_CID UINT32_MAX
+#endif /* (UBCL_MAX_CID < UINT32_MAX) */
+
+/**
+ * Module structure
+ */
+struct mca_pml_ubcl_module_t {
+    mca_pml_base_module_t super;
+};
+typedef struct mca_pml_ubcl_module_t mca_pml_ubcl_module_t;
+
+/**
+ * Component structure
+ */
+struct mca_pml_ubcl_component_t {
+    mca_pml_base_component_t super;
+
+    /** Functional fields **/
+    char is_init;           /**< Whether we have been initialized, for proper close */
+    int output;             /**< Output stream */
+    char thread_multiple_enabled; /**< Multithreading support */
+    size_t nprocs;          /**< Number of known processes */
+    void **stack_addr_buffer; /**< Buffer to store stack on component error */
+    int n_addr;             /**< Number of void * addresses in #stack_addr_buffer */
+
+    /** MCA parameters **/
+    int priority;           /**< Priority of the component */
+    int verbose;            /**< Verbosity level of the component */
+    char force_intranode_bxi; /**< Whether to force intranode communication *
+                               * via ubcl cards */
+    char force_cuda_custom_dt; /**< Whether to force custom datatype use for CUDA
+                                * instead of using ADGE for contiguous CUDA buffers */
+    char can_progress;      /**< Allow PML to call opal_progress() once at the end of
+                             * each primitive */
+    char gdb_attach;        /**< Allow attaching a debugger by looping indefinitely on
+                             * this value until it is set to 0 */
+    unsigned int max_req;   /**< Maximum number of requests */
+    unsigned int min_req;   /**< Minimum (and initial) number of requests */
+    unsigned int incr_req;  /**< Increment for the number of requests */
+    unsigned int pad_req;
+
+    char check_recv_rsend;  /**< Warn if an rsend did not immediately match a recv */
+    char warn_on_truncate;  /**< Warn if receives are truncated */
+    char abort_on_truncate; /**< Abort if receives are truncated */
+    char use_mpi_wildcards; /**< Activate MPI_ANY_SOURCE and MPI_ANY_TAG support */
+    char accelerator_is_cuda; /**< True if the current accelerator is 'cuda' */
+
+    /** UBCL endpoint type capabilities **/
+    ubcl_endpoint_capabilities_t endpoint_capabilities[UBCL_ENDPOINT_TYPE_SIZE];
+
+    opal_free_list_t pml_req_free_list;
+};
+typedef struct mca_pml_ubcl_component_t mca_pml_ubcl_component_t;
+
+/*
+ * mca_pml_comm_t is an anonymous structure used in ompi_comm_t. Each pml can
+ * provide its own declaration of mca_pml_comm_t.
+ * Don't change this name. 
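+ *
+ * The array member caches the comm-rank -> ubcl-rank translation filled in
+ * by mca_pml_ubcl_add_comm(), so the send path can resolve a peer with a
+ * plain lookup (illustrative sketch):
+ *
+ *   mca_pml_ubcl_comm_t *pcomm = (mca_pml_ubcl_comm_t *) comm->c_pml_comm;
+ *   uint64_t ubcl_rank = pcomm->array[dst];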
+ */ +struct mca_pml_comm_t { + uint64_t *array; + uint32_t size; + uint16_t is_inter; + uint16_t pad0; +}; +typedef struct mca_pml_comm_t mca_pml_ubcl_comm_t; + +/** Sole PML module **/ +extern mca_pml_ubcl_module_t mca_pml_ubcl_module; + +/** PML UBCL component **/ +OMPI_DECLSPEC extern mca_pml_ubcl_component_t mca_pml_ubcl_component; + +/** + * Internal API + */ +void mca_pml_ubcl_isend_start(struct ompi_request_t **request); +void mca_pml_ubcl_irecv_prepare(void *buf, size_t count, ompi_datatype_t *datatype, int src, + int tag, struct ompi_communicator_t *comm, + struct ompi_request_t **request, bool persistent, bool probe, + struct ompi_message_t *message); +void mca_pml_ubcl_irecv_start(struct ompi_request_t **request); + +size_t pml_ubcl_datatype_pack(void *pack_buf, const void *usr_handle, size_t pack_size, + size_t offset); + +size_t pml_ubcl_datatype_unpack(void *usr_handle, const void *pack_buf, size_t pack_size, + size_t offset); + +size_t pml_ubcl_datatype_mem_size(const void *usr_handle, size_t offset); + +void pml_ubcl_datatype_finish(void *usr_handle); + +/** + * PML component API (see pml_ubcl_component.c) + */ +int mca_pml_ubcl_component_open(void); +int mca_pml_ubcl_component_close(void); +int mca_pml_ubcl_component_register(void); +mca_pml_base_module_t *mca_pml_ubcl_component_init(int *priority, bool enable_progress_threads, + bool enable_mpi_threads); +int mca_pml_ubcl_component_finalize(void); + +/** + * PML API (see pml_ubcl.c) + */ +int mca_pml_ubcl_add_comm(struct ompi_communicator_t *comm); +int mca_pml_ubcl_del_comm(struct ompi_communicator_t *comm); +int mca_pml_ubcl_enable(bool enable); +int mca_pml_ubcl_progress(void); +int mca_pml_ubcl_iprobe(int src, int tag, struct ompi_communicator_t *comm, int *matched, + ompi_status_public_t *status); +int mca_pml_ubcl_probe(int src, int tag, struct ompi_communicator_t *comm, + ompi_status_public_t *status); +int mca_pml_ubcl_improbe(int src, int tag, struct ompi_communicator_t *comm, int *matched, + struct ompi_message_t **message, ompi_status_public_t *status); +int mca_pml_ubcl_mprobe(int src, int tag, struct ompi_communicator_t *comm, + struct ompi_message_t **message, ompi_status_public_t *status); +int mca_pml_ubcl_isend_init(const void *buf, size_t count, ompi_datatype_t *datatype, int dst, + int tag, mca_pml_base_send_mode_t mode, + struct ompi_communicator_t *comm, struct ompi_request_t **request); +int mca_pml_ubcl_isend(const void *buf, size_t count, ompi_datatype_t *datatype, int dst, int tag, + mca_pml_base_send_mode_t mode, struct ompi_communicator_t *comm, + struct ompi_request_t **request); +int mca_pml_ubcl_send(const void *buf, size_t count, ompi_datatype_t *datatype, int dst, int tag, + mca_pml_base_send_mode_t mode, struct ompi_communicator_t *comm); +int mca_pml_ubcl_irecv_init(void *buf, size_t count, ompi_datatype_t *datatype, int src, int tag, + struct ompi_communicator_t *comm, struct ompi_request_t **request); +int mca_pml_ubcl_irecv(void *buf, size_t count, ompi_datatype_t *datatype, int src, int tag, + struct ompi_communicator_t *comm, struct ompi_request_t **request); +int mca_pml_ubcl_recv(void *buf, size_t count, ompi_datatype_t *datatype, int src, int tag, + struct ompi_communicator_t *comm, ompi_status_public_t *status); +int mca_pml_ubcl_imrecv(void *buf, size_t count, ompi_datatype_t *datatype, + struct ompi_message_t **message, struct ompi_request_t **request); +int mca_pml_ubcl_mrecv(void *buf, size_t count, ompi_datatype_t *datatype, + struct ompi_message_t **message, 
ompi_status_public_t *status); +int mca_pml_ubcl_dump(struct ompi_communicator_t *comm, int verbose); +int mca_pml_ubcl_start(size_t count, ompi_request_t **requests); +int mca_pml_ubcl_ft_event(int state); + +#endif /* MCA_PML_UBCL_H */ diff --git a/ompi/mca/pml/ubcl/pml_ubcl_component.c b/ompi/mca/pml/ubcl/pml_ubcl_component.c new file mode 100644 index 00000000000..40eef2c9291 --- /dev/null +++ b/ompi/mca/pml/ubcl/pml_ubcl_component.c @@ -0,0 +1,288 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2019-2024 Bull SAS. All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file pml_ubcl_component.c + * + * UBCL PML component implementation + * + * Functions parameters and return values defined in ompi/mca/pml/pml.h. + */ + +#include "opal/include/opal_config.h" + +#include "ompi/mca/pml/ubcl/pml_ubcl.h" +#include "ompi/mca/pml/ubcl/pml_ubcl_utils.h" +#include "ompi/mca/pml/ubcl/pml_ubcl_request.h" +#include "ompi/mca/pml/ubcl/pml_ubcl_endpoint.h" +#include "opal/mca/btl/base/base.h" +#include "opal/mca/accelerator/base/base.h" +#include "opal/mca/common/ubcl/common_ubcl.h" +#include "opal/prefetch.h" +#include "opal/util/proc.h" + +#include + +/** + * PML UBCL Component + */ +mca_pml_ubcl_component_t mca_pml_ubcl_component = { + { + .pmlm_version = { + MCA_PML_BASE_VERSION_2_1_0, + + .mca_component_name = "ubcl", + .mca_component_major_version = OMPI_MAJOR_VERSION, + .mca_component_minor_version = OMPI_MINOR_VERSION, + .mca_component_release_version = OMPI_RELEASE_VERSION, + .mca_open_component = mca_pml_ubcl_component_open, + .mca_close_component = mca_pml_ubcl_component_close, + .mca_register_component_params = mca_pml_ubcl_component_register + }, + .pmlm_data = { + /* The component is checkpoint ready */ + MCA_BASE_METADATA_PARAM_NONE + }, + + .pmlm_init = mca_pml_ubcl_component_init, + .pmlm_finalize = mca_pml_ubcl_component_finalize, + }, + + .is_init = 0, + .accelerator_is_cuda = false, + .nprocs = 0, +}; + +/** + * Open opal output, 0-initialize some parameters and forward to communication + * modules + */ +int mca_pml_ubcl_component_open(void) +{ + /* Open output stream */ + if (0 < mca_pml_ubcl_component.verbose || mca_pml_ubcl_component.gdb_attach) { + mca_pml_ubcl_component.output = opal_output_open(NULL); + int verbose = mca_pml_ubcl_component.verbose > 0 ? 
mca_pml_ubcl_component.verbose : 1; + opal_output_set_verbosity(mca_pml_ubcl_component.output, verbose); + } else { + mca_pml_ubcl_component.output = -1; + } + + /* If MCA param set, wait until gdb_attach is set to 0 from outside */ + if (mca_pml_ubcl_component.gdb_attach) { + opal_output_verbose(1, mca_pml_ubcl_component.output, + "set mca_pml_ubcl_component.gdb_attach = 0\n"); + while (mca_pml_ubcl_component.gdb_attach) { + sleep(1); + }; + } + + return OMPI_SUCCESS; +} + +/** + * Close communication modules and opal output + */ +int mca_pml_ubcl_component_close(void) +{ + OPAL_OUTPUT_VERBOSE((50, mca_pml_ubcl_component.output, "UBCL_COMPONENT_CLOSE\n")); + return OMPI_SUCCESS; +} + +int mca_pml_ubcl_component_register(void) +{ + mca_base_component_t *component = &mca_pml_ubcl_component.super.pmlm_version; + + mca_pml_ubcl_component.verbose = 0; + (void) mca_base_component_var_register(component, "verbose", "Verbosity level of the pml/ubcl.", + MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, + &mca_pml_ubcl_component.verbose); + + mca_pml_ubcl_component.priority = 90; + (void) mca_base_component_var_register(component, "priority", + "Priority of the pml/ubcl component", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &mca_pml_ubcl_component.priority); + + mca_pml_ubcl_component.force_intranode_bxi = false; + (void) mca_base_component_var_register(component, "force_intranode_bxi", + "Whether to force intranode communication to go through " + "BXI network instead of shared memory.", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &mca_pml_ubcl_component.force_intranode_bxi); + + mca_pml_ubcl_component.force_cuda_custom_dt = false; + (void) mca_base_component_var_register(component, "force_cuda_custom_dt", + "Force the pml/ubcl to use custom datatype to pack/unpack cuda " + "buffers. This prevents the use of ADGE by UBCL", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &mca_pml_ubcl_component.force_cuda_custom_dt); + + mca_pml_ubcl_component.can_progress = false; + (void) mca_base_component_var_register( + component, "can_progress", + "Allow PML to call opal_progress() once at the end of each primitive.", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, + &mca_pml_ubcl_component.can_progress); + + mca_pml_ubcl_component.warn_on_truncate = true; + (void) mca_base_component_var_register( + component, "warn_on_truncate", + "Allow PML to print warning messages whenever a truncation error is detected", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, + &mca_pml_ubcl_component.warn_on_truncate); + + mca_pml_ubcl_component.abort_on_truncate = true; + (void) mca_base_component_var_register( + component, "abort_on_truncate", + "Allow PML to print error and abort in case of MPI_ERR_TRUNCATE", MCA_BASE_VAR_TYPE_BOOL, + NULL, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, + &mca_pml_ubcl_component.abort_on_truncate); + + mca_pml_ubcl_component.use_mpi_wildcards = true; + (void) mca_base_component_var_register( + component, "use_mpi_wildcards", + "MPI_ANY_SOURCE or MPI_ANY_TAG are used. 
For better performance this should be disabled.", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, + &mca_pml_ubcl_component.use_mpi_wildcards); + + mca_pml_ubcl_component.gdb_attach = false; + (void) mca_base_component_var_register( + component, "gdb_attach", + "Allow to attach a debugger by looping indefinitly on this value until 0.", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, + &mca_pml_ubcl_component.gdb_attach); + + + mca_pml_ubcl_component.max_req = 32768; + (void) mca_base_component_var_register(component, "max_req", + "Maximum number of requests allocated. (0 means infinite)", + MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, + &mca_pml_ubcl_component.max_req); + + mca_pml_ubcl_component.min_req = 1024; + (void) mca_base_component_var_register(component, "min_req", + "Minimum (and initial) number of requests allocated.", + MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, + &mca_pml_ubcl_component.min_req); + + mca_pml_ubcl_component.incr_req = 1024; + (void) mca_base_component_var_register(component, "incr_req", + "Increasing number of requests allocated.", + MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, + &mca_pml_ubcl_component.incr_req); + + mca_common_ubcl_register_mca(); + + return OMPI_SUCCESS; +} + +static void mca_pml_ubcl_check_cuda_accelerator() +{ + const char* cuda_component_name = "cuda"; + const char* selected_component_name = opal_accelerator_base_selected_component.base_version.mca_component_name; + + /* Check if we are currently using accelerator cuda */ + /* Only one single accelerator can be selected/active. Knowing if it's the + * cuda accelerator let us know if our device buffers are cuda or not */ + if (0 == strcmp(cuda_component_name, selected_component_name)) { + mca_pml_ubcl_component.accelerator_is_cuda = true; + } +} + +/** + * Initialize parameters and forward to communication modules + */ +mca_pml_base_module_t *mca_pml_ubcl_component_init(int *priority, bool enable_progress_threads, + bool enable_mpi_threads) +{ + int err; + + OPAL_OUTPUT_VERBOSE((50, mca_pml_ubcl_component.output, "UBCL_COMPONENT_INIT\n")); + + /* Register thread level */ + mca_pml_ubcl_component.thread_multiple_enabled = enable_progress_threads || enable_mpi_threads; + + if (OPAL_SUCCESS != mca_common_ubcl_init()) { + mca_pml_ubcl_warn(OMPI_ERR_NOT_AVAILABLE, "common_ubcl could not load UBCL library\n"); + return NULL; + } + + OBJ_CONSTRUCT(&mca_pml_ubcl_component.pml_req_free_list, opal_free_list_t); + err = opal_free_list_init (&mca_pml_ubcl_component.pml_req_free_list, + sizeof(mca_pml_ubcl_request_t), + opal_cache_line_size, + OBJ_CLASS(mca_pml_ubcl_request_t), + 0, opal_cache_line_size, + mca_pml_ubcl_component.min_req, + mca_pml_ubcl_component.max_req, + mca_pml_ubcl_component.incr_req, + NULL, 0, NULL, NULL, NULL); + + if (OPAL_UNLIKELY(OMPI_SUCCESS != err)) { + mca_pml_ubcl_warn(OMPI_ERR_OUT_OF_RESOURCE, "Not enough memory (%d)", err); + return NULL; + } + + /* Initialize UBCL */ + if (UBCL_SUCCESS != ubcl_init(mca_pml_ubcl_component.thread_multiple_enabled)) { + return NULL; + } + + err = mca_pml_ubcl_create_local_endpoint(); + if (OMPI_SUCCESS != err) { + return NULL; + } + mca_pml_ubcl_check_cuda_accelerator(); + + /* Mark as initialized, set priority and return */ + mca_pml_ubcl_component.is_init = 1; + OPAL_OUTPUT_VERBOSE((50, mca_pml_ubcl_component.output, 
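+                         /* At this point the request pool is ready (min_req
+                          * requests preallocated, the free list grows by
+                          * incr_req up to max_req, 0 meaning unbounded) and
+                          * the local UBCL endpoints have been created. */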
"INITIATION DONE\n")); + *priority = mca_pml_ubcl_component.priority; + return &mca_pml_ubcl_module.super; +} + +/** + * Finalize parameters and forward to communication modules + */ +int mca_pml_ubcl_component_finalize(void) +{ + int ompi_ret; + OPAL_OUTPUT_VERBOSE((50, mca_pml_ubcl_component.output, "ubcl_COMPONENT_FINALIZE")); + + if (0 == mca_pml_ubcl_component.is_init) { + return OMPI_SUCCESS; + } + + ompi_ret = mca_pml_ubcl_free_local_endpoints(); + if (OMPI_SUCCESS != ompi_ret) { + return ompi_ret; + } + + /* Finalize UBCL */ + if (UBCL_SUCCESS != ubcl_fini()) { + return OMPI_ERROR; + } + + OBJ_DESTRUCT(&mca_pml_ubcl_component.pml_req_free_list); + + if (OPAL_SUCCESS != mca_common_ubcl_fini()) { + return OMPI_ERROR; + } + + return OMPI_SUCCESS; +} diff --git a/ompi/mca/pml/ubcl/pml_ubcl_datatype.c b/ompi/mca/pml/ubcl/pml_ubcl_datatype.c new file mode 100644 index 00000000000..cd3ebb32fa5 --- /dev/null +++ b/ompi/mca/pml/ubcl/pml_ubcl_datatype.c @@ -0,0 +1,89 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2024 Bull SAS. All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file pml_ubcl_datatype.c + * + * PML/UBCL datatype and convertor related functions + * + */ + +#include "ompi/mca/pml/ubcl/pml_ubcl.h" +#include "ompi/mca/pml/ubcl/pml_ubcl_utils.h" +#include "ompi/mca/pml/ubcl/pml_ubcl_request.h" + +size_t pml_ubcl_datatype_pack(void *pack_buf, const void *usr_handle, size_t pack_size, + size_t offset) +{ + opal_convertor_t *convertor = (opal_convertor_t *) usr_handle; + + /* Set input data size and start pointer. */ + uint32_t iov_count = 1; + int ret = 0; + struct iovec iov; + iov.iov_len = pack_size; + iov.iov_base = (IOVBASE_TYPE *) pack_buf; + + opal_convertor_set_position(convertor, &offset); + + /* Pack data from converter to iov */ + ret = opal_convertor_pack(convertor, &iov, &iov_count, &pack_size); + if (-1 == ret) { + mca_pml_ubcl_error(ret, "opal_convertor_unpack failed\n"); + } + + return pack_size; +} + +size_t pml_ubcl_datatype_unpack(void *usr_handle, const void *pack_buf, size_t pack_size, + size_t offset) +{ + opal_convertor_t *convertor = (opal_convertor_t *) usr_handle; + + /* Set input data size and start pointer. */ + uint32_t iov_count = 1; + int ret = 0; + struct iovec iov; + iov.iov_len = pack_size; + iov.iov_base = (IOVBASE_TYPE *) pack_buf; + + opal_convertor_set_position(convertor, &offset); + + /* Pack data from converter to iov */ + ret = opal_convertor_unpack(convertor, &iov, &iov_count, &pack_size); + if (-1 == ret) { + mca_pml_ubcl_error(ret, "opal_convertor_unpack failed\n"); + } + + return pack_size; +} + +size_t pml_ubcl_datatype_mem_size(const void *usr_handle, size_t offset) +{ + opal_convertor_t *convertor = (opal_convertor_t *) usr_handle; + size_t size = 0; + + opal_datatype_type_size(convertor->pDesc, &size); + + if (offset > size * convertor->count) { + return 0; + } + + return size * convertor->count - offset; +} + +void pml_ubcl_datatype_finish(void *usr_handle) +{ + /* + * Does nothing + */ + + return; +} diff --git a/ompi/mca/pml/ubcl/pml_ubcl_endpoint.c b/ompi/mca/pml/ubcl/pml_ubcl_endpoint.c new file mode 100644 index 00000000000..04e29babed9 --- /dev/null +++ b/ompi/mca/pml/ubcl/pml_ubcl_endpoint.c @@ -0,0 +1,418 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2019-2025 Bull SAS. All rights reserved. 
+ *
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+/**
+ * @file pml_ubcl_endpoint.c
+ *
+ * UBCL PML
+ *
+ * Contains functions related to ubcl endpoints
+ */
+
+#include "ompi/mca/pml/ubcl/pml_ubcl.h"
+#include "ompi/constants.h"
+#include "ompi/mca/common/ubcl/common_ubcl.h"
+#include "ompi/mca/pml/ubcl/pml_ubcl_utils.h"
+#include "ompi/mca/pml/ubcl/pml_ubcl_request.h"
+#include "ompi/proc/proc.h"
+#include "opal/class/opal_object.h"
+#include "opal/datatype/opal_convertor.h"
+#include "opal/mca/hwloc/hwloc-internal.h"
+#include "opal/mca/common/ubcl/common_ubcl.h"
+#include "opal/prefetch.h"
+#include "opal/util/proc.h"
+#include "ubcl_api.h"
+
+/* A UBCL rank is 61 bits wide; the 32-bit ompi jobid is packed above the vpid,
+ * so the vpid must be truncated to the low 29 bits */
+#define PML_UBCL_VPID_MAX (((1 << 29) - 1)) /* UBCL keeps the top 3 bits for itself */
+#define PML_UBCL_JOBID_MAX (OPAL_JOBID_MAX)
+
+static void mca_pml_ubcl_forge_modex_key(char *keyname, size_t size, const int type)
+{
+    int ret;
+
+    switch (type) {
+    case UBCL_ENDPOINT_TYPE_BXI:
+        ret = snprintf(keyname, size - 1, "OMPI_UBCL_BXI_ID");
+        break;
+    case UBCL_ENDPOINT_TYPE_SHMEM:
+        ret = snprintf(keyname, size - 1, "OMPI_UBCL_SHM_ID");
+        break;
+    /* SELF endpoints don't need to forge modex keys */
+    case UBCL_ENDPOINT_TYPE_SELF:
+    default:
+        ret = 0;
+    }
+
+    if (0 >= ret || ((size_t) ret) > size - 1) {
+        mca_pml_ubcl_error(OMPI_ERROR, "Failed to forge modex keyname");
+    }
+
+    /* paranoia: guarantee NUL termination */
+    keyname[size - 1] = '\0';
+}
+
+static uint64_t mca_pml_forge_rank(ompi_proc_t *proc)
+{
+    uint64_t jobid, rank;
+
+    if (ompi_proc_is_sentinel(proc)) {
+        mca_pml_ubcl_error(OMPI_ERROR,
+                           "PML/UBCL: sentinel procs are not supported");
+        return 0;
+    }
+
+    jobid = proc->super.proc_name.jobid;
+    rank = proc->super.proc_name.vpid;
+
+    if (rank > (uint32_t) PML_UBCL_VPID_MAX) {
+        mca_pml_ubcl_error(OMPI_ERROR,
+                           "PML/UBCL rank forging failed: vpid too high (%" PRIu64 ")", rank);
+    }
+
+    return (rank | (jobid << 29));
+}
+
+/**
+ * Init time: initialize transports and publish the UBCL endpoint handles to PMIx
+ */
+
+static int mca_pml_ubcl_endpoint_modex_put(const int type, void *endpoint_h, size_t size)
+{
+    int ret;
+    char keyname[256];
+
+    mca_pml_ubcl_forge_modex_key(keyname, sizeof(keyname), type);
+    OPAL_MODEX_SEND_STRING(ret, PMIX_GLOBAL, keyname, endpoint_h, size);
+    if (0 > ret) {
+        mca_pml_ubcl_error(OMPI_ERROR, "Failed to modex send string: %s (%d)",
+                           opal_strerror(ret), ret);
+    }
+
+    return OMPI_SUCCESS;
+}
+
+static int mca_pml_ubcl_export_local_endpoint_handle(const int type)
+{
+    int err;
+    uint64_t remote_rank_u64;
+    char endpoint_h[UBCL_HANDLE_SIZE];
+    const size_t size = sizeof(endpoint_h);
+
+    /* dummy value: the exported handle is not tied to a specific remote rank */
+    remote_rank_u64 = UBCL_ANY_RANK;
+
+    err = ubcl_export_local_endpoint_handle(type, endpoint_h, &remote_rank_u64);
+    if (UBCL_SUCCESS != err) {
+        return OMPI_ERROR;
+    }
+
+    mca_pml_ubcl_endpoint_modex_put(type, (void *) endpoint_h, size);
+
+    /* We were just interested in the handle.
+     * The actual recv rank will be allocated during add_procs calls */
+    err = ubcl_close_local_endpoint_channel(type, remote_rank_u64);
+    if (UBCL_SUCCESS != err) {
+        mca_pml_ubcl_warn(OMPI_ERROR,
+                          "PML/UBCL failed to clean local endpoint (very unlikely error)."
+ " For safety reason PML will be disabled."); + return OMPI_ERROR; + } + + return OMPI_SUCCESS; +} + +int mca_pml_ubcl_create_local_endpoint(void) +{ + int type; + ubcl_error_t err; + int ompi_error; + + type = UBCL_ENDPOINT_TYPE_SELF; + err = ubcl_create_local_endpoint(type); + if (UBCL_SUCCESS != err) { + mca_pml_ubcl_error(OMPI_ERROR, "Failed ubcl_create_local_endpoint %d (%d)", type, err); + } + + /* UBCL_ENDPOINT_SHM */ + if (!mca_pml_ubcl_component.force_intranode_bxi) { + type = UBCL_ENDPOINT_TYPE_SHMEM; + err = ubcl_create_local_endpoint(type); + if (UBCL_SUCCESS != err) { + mca_pml_ubcl_error(OMPI_ERROR, "Failed ubcl_create_local_endpoint %d (%d)", type, err); + } + ompi_error = mca_pml_ubcl_export_local_endpoint_handle(type); + if (OMPI_SUCCESS != ompi_error) { + return ompi_error; + } + } + + type = UBCL_ENDPOINT_TYPE_BXI; + err = ubcl_create_local_endpoint(type); + if (UBCL_SUCCESS != err) { + mca_pml_ubcl_error(OMPI_ERROR, "Failed ubcl_create_local_endpoint %d (%d)", type, err); + } + ompi_error = mca_pml_ubcl_export_local_endpoint_handle(type); + if (OMPI_SUCCESS != ompi_error) { + return ompi_error; + } + + return OMPI_SUCCESS; +} + +int mca_pml_ubcl_free_local_endpoints() +{ + int ret; + /* Finalize BXI */ + ret = ubcl_free_local_endpoint(UBCL_ENDPOINT_TYPE_BXI); + if (UBCL_SUCCESS != ret) { + return OMPI_ERROR; + } + if (!mca_pml_ubcl_component.force_intranode_bxi) { + ret = ubcl_free_local_endpoint(UBCL_ENDPOINT_TYPE_SHMEM); + if (UBCL_SUCCESS != ret) { + return OMPI_ERROR; + } + } + ret = ubcl_free_local_endpoint(UBCL_ENDPOINT_TYPE_SELF); + if (UBCL_SUCCESS != ret) { + return OMPI_ERROR; + } + + return OMPI_SUCCESS; +} +/** + * Add_proce time: create send and recv endpoint for each peer + */ + +static int mca_pml_ubcl_recv_endpoint_modex_get(ompi_proc_t *proc, const int type, + endp_handle_t endpoint_h, size_t size) +{ + char keyname[256]; + size_t received_size; + void *received_buffer; + int ret; + + received_size = 0; + received_buffer = NULL; + + mca_pml_ubcl_forge_modex_key(keyname, sizeof(keyname), type); + OPAL_MODEX_RECV_STRING(ret, keyname, &proc->super.proc_name, + (void**) &received_buffer, &received_size); + if (0 > ret) { + mca_pml_ubcl_error(OMPI_ERROR, "Failed to modex recv string: %s (%d)", + opal_strerror(ret), ret); + } + + if (received_size != size) { + mca_pml_ubcl_error(OMPI_ERROR, "Modex value is truncated (expected: %zu, receiced: %zu)", + size, received_size); + } + + memcpy(endpoint_h, received_buffer, size); + + free(received_buffer); + + return OMPI_SUCCESS; +} + +static int mca_pml_ubcl_create_send_endpoint(ompi_proc_t *proc, size_t remote_rank, int type) +{ + ubcl_error_t err; + char endpoint_h[UBCL_HANDLE_SIZE]; + uint64_t ubcl_rank; + ompi_proc_t *self; + + self = ompi_proc_local(); + ubcl_rank = mca_pml_forge_rank(self); + + mca_pml_ubcl_recv_endpoint_modex_get(proc, type, (endp_handle_t) endpoint_h, sizeof(endpoint_h)); + err = ubcl_create_remote_endpoint(ubcl_rank, remote_rank, type, (endp_handle_t) endpoint_h); + + if (UBCL_SUCCESS != err) { + return OMPI_ERROR; + } + + ubcl_get_endpoint_type_capabilities(type, &mca_pml_ubcl_component.endpoint_capabilities[type]); + + return OMPI_SUCCESS; +} + +static int mca_pml_ubcl_create_recv_endpoint(uint64_t sender_rank, const int type) +{ + ubcl_error_t err; + uint64_t remote_rank_u64; + endp_handle_t endpoint_h[UBCL_HANDLE_SIZE]; + + remote_rank_u64 = sender_rank; + + err = ubcl_export_local_endpoint_handle(type, endpoint_h, &remote_rank_u64); + if (UBCL_SUCCESS != err) { + return OMPI_ERROR; + 
} + + return OMPI_SUCCESS; +} + +static int mca_pml_ubcl_create_self_endpoints(uint64_t remote_rank) +{ + ubcl_error_t err; + int type = UBCL_ENDPOINT_TYPE_SELF; + char endpoint_h[UBCL_HANDLE_SIZE]; + uint64_t my_rank = remote_rank; + + err = ubcl_export_local_endpoint_handle(type, endpoint_h, &my_rank); + if (UBCL_SUCCESS != err) { + return OMPI_ERROR; + } + err = ubcl_create_remote_endpoint(my_rank, my_rank, type, endpoint_h); + if (UBCL_SUCCESS != err) { + return OMPI_ERROR; + } + + return OMPI_SUCCESS; +} + +static int get_endpoint_type(ompi_proc_t *proc) +{ + if (ompi_proc_local() == proc) { + return UBCL_ENDPOINT_TYPE_SELF; + } + + /* Known limitation: proc_flags are invalid when jobid is different */ + if (proc->super.proc_name.jobid == ompi_proc_local()->super.proc_name.jobid + && OPAL_PROC_ON_LOCAL_NODE(proc->super.proc_flags) + && !mca_pml_ubcl_component.force_intranode_bxi) { + return UBCL_ENDPOINT_TYPE_SHMEM; + } else { + return UBCL_ENDPOINT_TYPE_BXI; + } +} + +void mca_pml_ubcl_endpoint_retain(ompi_proc_t *proc) +{ + mca_common_ubcl_endpoint_t *endpoint = NULL; + assert(NULL != proc); + + endpoint = (proc)->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PML]; + assert(NULL != endpoint); + + opal_atomic_fetch_add_32(&endpoint->refcount, 1); + mca_pml_ubcl_component.nprocs++; + OBJ_RETAIN(proc); +} + +int mca_pml_ubcl_create_endpoints(ompi_proc_t *proc) +{ + int err = OMPI_SUCCESS; + mca_common_ubcl_endpoint_t *new_endpoint; + + new_endpoint = malloc(sizeof(mca_common_ubcl_endpoint_t)); + if (NULL == new_endpoint) { + mca_pml_ubcl_error(OMPI_ERR_OUT_OF_RESOURCE, + "PML/UBCL BXI EP Malloc: not enough memory"); + } + + new_endpoint->refcount = 0; //we increment it to 1 in endpoint_retain + new_endpoint->rank = mca_pml_forge_rank(proc); + new_endpoint->type = get_endpoint_type(proc); + + if (UBCL_ENDPOINT_TYPE_SELF == new_endpoint->type) { + err = mca_pml_ubcl_create_self_endpoints((uint64_t) new_endpoint->rank); + goto end; + } + + err = mca_pml_ubcl_create_recv_endpoint(new_endpoint->rank, new_endpoint->type); + if (OMPI_SUCCESS != err) { + mca_pml_ubcl_error(err, "Failed to create recv endpoint for rank %zu\n", + new_endpoint->rank); + } + + err = mca_pml_ubcl_create_send_endpoint(proc, new_endpoint->rank, new_endpoint->type); + if (OMPI_SUCCESS != err) { + mca_pml_ubcl_error(err, "Failed to create send endpoint for rank %zu\n", + new_endpoint->rank); + } + +end: + (proc)->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PML] = new_endpoint; + mca_pml_ubcl_endpoint_retain(proc); + + return err; +} + +int mca_pml_ubcl_add_procs(ompi_proc_t **procs, size_t nprocs) +{ + OPAL_OUTPUT_VERBOSE((50, mca_pml_ubcl_component.output, "UBCL_MODULE_ADD_PROCS\n")); + OPAL_OUTPUT_VERBOSE((50, mca_pml_ubcl_component.output, "UBCL ADD PROCS: %lu to add", nprocs)); + + /* Initialize all endpoint with remote rank */ + for (size_t i = 0; i < nprocs; i++) { + /* Let's not create endpoints or increment refcount multiple times */ + if (NULL == procs[i]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PML]) { + int ret = mca_pml_ubcl_create_endpoints(procs[i]); + if (OMPI_SUCCESS != ret) { + mca_pml_ubcl_error(ret, "Failed mca_ubcl_create_remote_endpoint"); + } + } + } + + OPAL_OUTPUT_VERBOSE((50, mca_pml_ubcl_component.output, "PML/UBCL ADD_PROCS called")); + + return OMPI_SUCCESS; +} + +int mca_pml_ubcl_endpoint_release(ompi_proc_t *proc) +{ + uint32_t endpoint_refcount; + ubcl_error_t ret = UBCL_SUCCESS; + int ompi_error = OMPI_SUCCESS; + mca_common_ubcl_endpoint_t *endpoint = NULL; + assert(NULL != proc); + + endpoint = 
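+        /* Set by mca_pml_ubcl_create_endpoints(), which also bumped the
+         * refcount to 1 through mca_pml_ubcl_endpoint_retain(); the UBCL-side
+         * endpoints are freed below once the count drops to zero. */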
(proc)->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PML]; + assert(NULL != endpoint); + + endpoint_refcount = opal_atomic_sub_fetch_32(&endpoint->refcount, 1); + if (0 == endpoint_refcount) { + ret = ubcl_free_remote_endpoint(endpoint->rank); + if (UBCL_SUCCESS != ret) { + ompi_error = ubcl_error_to_ompi(ret); + mca_pml_ubcl_warn(ompi_error, "PML/UBCL failed to free remote endpoint"); + } + ret = ubcl_close_local_endpoint_channel(endpoint->type, endpoint->rank); + if (UBCL_SUCCESS != ret) { + ompi_error = ubcl_error_to_ompi(ret); + mca_pml_ubcl_warn(ompi_error, "PML/UBCL failed to close local endpoint channel"); + } + free(endpoint); + proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PML] = NULL; + mca_pml_ubcl_component.nprocs -= 1; + OBJ_RELEASE(proc); + } + + return ompi_error; +} + +int mca_pml_ubcl_del_procs(ompi_proc_t **procs, size_t nprocs) +{ + int ret = OMPI_SUCCESS; + OPAL_OUTPUT_VERBOSE((50, mca_pml_ubcl_component.output, "UBCL_MODULE_DEL_PROCS\n")); + + for (uint32_t i = 0; i < nprocs; i++) { + if (OMPI_SUCCESS != mca_pml_ubcl_endpoint_release(procs[i])) { + ret = OMPI_ERROR; + } + } + + OPAL_OUTPUT_VERBOSE((50, mca_pml_ubcl_component.output, "PML/UBCL DEL_PROCS called")); + + return ret; +} diff --git a/ompi/mca/pml/ubcl/pml_ubcl_endpoint.h b/ompi/mca/pml/ubcl/pml_ubcl_endpoint.h new file mode 100644 index 00000000000..0d0a44879ec --- /dev/null +++ b/ompi/mca/pml/ubcl/pml_ubcl_endpoint.h @@ -0,0 +1,18 @@ +#ifndef PML_UBCL_ENDPOINT_INCLUDE_H +#define PML_UBCL_ENDPOINT_INCLUDE_H + +#include +#include "opal/util/proc.h" +/** * Endpoint structure */ +#include "opal/mca/common/ubcl/common_ubcl.h" + +/* endpoint functions */ + +int mca_pml_ubcl_create_local_endpoint(void); +int mca_pml_ubcl_free_local_endpoints(void); +int mca_pml_ubcl_endpoint_release(ompi_proc_t *proc); +void mca_pml_ubcl_endpoint_retain(ompi_proc_t *proc); +int mca_pml_ubcl_add_procs(ompi_proc_t **procs, size_t nprocs); +int mca_pml_ubcl_del_procs(ompi_proc_t **procs, size_t nprocs); + +#endif /* #ifndef PML_UBCL_ENDPOINT_INCLUDE_H */ diff --git a/ompi/mca/pml/ubcl/pml_ubcl_iprobe.c b/ompi/mca/pml/ubcl/pml_ubcl_iprobe.c new file mode 100644 index 00000000000..6b6dbad0cee --- /dev/null +++ b/ompi/mca/pml/ubcl/pml_ubcl_iprobe.c @@ -0,0 +1,129 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2019-2025 Bull SAS. All rights reserved. 
+ * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file pml_ubcl_iprobe.c + * + * UBCL PML iprobe related functions + * + */ + +#include "ompi/constants.h" +#include "ompi/mca/common/ubcl/common_ubcl.h" +#include "ompi/mca/pml/ubcl/pml_ubcl.h" +#include "ompi/mca/pml/ubcl/pml_ubcl_utils.h" +#include "ompi/mca/pml/ubcl/pml_ubcl_request.h" +#include "ompi/datatype/ompi_datatype.h" +#include "ompi/message/message.h" +#include "ompi/proc/proc.h" +#include "opal/mca/common/ubcl/common_ubcl.h" +#include "ubcl_api.h" + +int mca_pml_ubcl_iprobe(int src, int tag, struct ompi_communicator_t *comm, + int *matched, ompi_status_public_t *status) +{ + OPAL_OUTPUT_VERBOSE((75, mca_pml_ubcl_component.output, + "UBCL_MODULE_IPROBE\n")); + ubcl_status_t ubcl_status; + uint64_t cid; + uint64_t rank; + + if (OMPI_ANY_SOURCE == src) { + rank = UBCL_ANY_SOURCE; + } else { + ompi_proc_t *proc = ompi_comm_peer_lookup(comm, src); + mca_common_ubcl_endpoint_t *endpoint = NULL; + endpoint = (mca_common_ubcl_endpoint_t *) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PML]; + rank = endpoint->rank; + } + + cid = ompi_comm_get_local_cid(comm); + ubcl_cid_t ubcl_cid= mca_pml_ubcl_compute_ubcl_cid(tag, cid); + + /* Call the UBCL api for iprobe */ + ubcl_iprobe(rank, tag, ubcl_cid, matched, &ubcl_status); + if (*matched) { + mca_common_ubcl_status_to_ompi(status, ubcl_status, comm, src); + } + + return OMPI_SUCCESS; +} + +int mca_pml_ubcl_probe(int src, int tag, struct ompi_communicator_t *comm, + ompi_status_public_t *status) +{ + OPAL_OUTPUT_VERBOSE((50, mca_pml_ubcl_component.output, + "UBCL_MODULE_PROBE\n")); + int match = 0; + + /* Loop over pml iprobe */ + while (!match) { + mca_pml_ubcl_iprobe(src, tag, comm, &match, status); + } + + return OMPI_SUCCESS; +} + +int mca_pml_ubcl_improbe(int src, int tag, struct ompi_communicator_t *comm, + int *matched, struct ompi_message_t **message, + ompi_status_public_t *status) +{ + OPAL_OUTPUT_VERBOSE((75, mca_pml_ubcl_component.output, + "UBCL_MODULE_IMPROBE\n")); + ubcl_status_t ubcl_status; + uint64_t rank; + uint64_t cid; + if (OMPI_ANY_SOURCE == src) { + rank = UBCL_ANY_SOURCE; + } else { + ompi_proc_t *proc = ompi_comm_peer_lookup(comm, src); + mca_common_ubcl_endpoint_t *endpoint = NULL; + endpoint = (mca_common_ubcl_endpoint_t *) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PML]; + rank = endpoint->rank; + } + + cid = ompi_comm_get_local_cid(comm); + ubcl_cid_t ubcl_cid = mca_pml_ubcl_compute_ubcl_cid(tag, cid); + + ubcl_message_t *ubcl_message; + + /* Call the UBCL api for improbe */ + ubcl_improbe(rank, tag, ubcl_cid, matched, &ubcl_message, &ubcl_status); + if (*matched) { + mca_common_ubcl_status_to_ompi(status, ubcl_status, comm, src); + *message = ompi_message_alloc(); + if (message == NULL) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + + (*message)->req_ptr = ubcl_message; + (*message)->comm = comm; + (*message)->peer = mca_common_ubcl_get_mpi_rank(src, comm, ubcl_status.remote); + (*message)->count = ubcl_status.size; + } + + return OMPI_SUCCESS; +} + +int mca_pml_ubcl_mprobe(int src, int tag, struct ompi_communicator_t *comm, + struct ompi_message_t **message, + ompi_status_public_t *status) +{ + OPAL_OUTPUT_VERBOSE((50, mca_pml_ubcl_component.output, + "UBCL_MODULE_MPROBE\n")); + int match = 0; + + /* Loop over pml improbe */ + while (!match) { + mca_pml_ubcl_improbe(src, tag, comm, &match, message, status); + } + + return OMPI_SUCCESS; +} diff --git a/ompi/mca/pml/ubcl/pml_ubcl_irecv.c b/ompi/mca/pml/ubcl/pml_ubcl_irecv.c 
new file mode 100644 index 00000000000..9ea74d9e428 --- /dev/null +++ b/ompi/mca/pml/ubcl/pml_ubcl_irecv.c @@ -0,0 +1,292 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2019-2025 Bull SAS. All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file pml_ubcl_irecv.c + * + * UBCL PML irecv related functions + * + * Functions parameters and return values defined in pml.h. + */ + +#include "opal/mca/common/ubcl/common_ubcl.h" + +#include "ompi/constants.h" +#include "ompi/mca/pml/pml_constants.h" +#include "opal/mca/common/ubcl/common_ubcl.h" +#include "ompi/mca/pml/ubcl/pml_ubcl.h" +#include "ompi/message/message.h" +#include "ompi/mca/pml/ubcl/pml_ubcl_utils.h" +#include "ompi/mca/pml/ubcl/pml_ubcl_request.h" +#include "ompi/datatype/ompi_datatype.h" +#include "ompi/proc/proc.h" +#include "ompi/mca/common/ubcl/common_ubcl.h" +#include "opal/mca/common/ubcl/common_ubcl.h" +#include "ubcl_api.h" + +/** + * Prepare a request for reception. + */ +void mca_pml_ubcl_irecv_prepare(void *buf, size_t count, + ompi_datatype_t *datatype, int src, int tag, + struct ompi_communicator_t *comm, + struct ompi_request_t **request, + bool persistent, bool probe, + struct ompi_message_t *message) +{ + ompi_proc_t *proc; + mca_pml_ubcl_request_t *req; + +#if defined(OPAL_ENABLE_DEBUG) && OPAL_ENABLE_DEBUG + if (probe) { + OPAL_OUTPUT_VERBOSE((75, mca_pml_ubcl_component.output, + "UBCL_MODULE_IRECV_PREPARE\n")); + } else { + OPAL_OUTPUT_VERBOSE((50, mca_pml_ubcl_component.output, + "UBCL_MODULE_IRECV_PREPARE\n")); + } +#endif /* OPAL_ENABLE_DEBUG */ + + /* Get proc */ + if (OMPI_ANY_SOURCE != src) { + proc = ompi_comm_peer_lookup(comm, src); + if (OPAL_UNLIKELY(NULL == proc)) { + mca_pml_ubcl_error(OMPI_ERROR, "Unknown proc"); + } + } else { + proc = NULL; + } + + /* Allocate request and activate it */ + req = (mca_pml_ubcl_request_t *) opal_free_list_get(&mca_pml_ubcl_component.pml_req_free_list); + if (OPAL_UNLIKELY(NULL == req)) { + mca_pml_ubcl_error(OMPI_ERR_OUT_OF_RESOURCE, + "Not enough memory to allocate a recv request"); + } + + MCA_PML_UBCL_RECV_REQUEST_INIT(req, buf, count, datatype, src, tag, comm, + proc, persistent, probe, message); + + /* Set user request */ + *request = &req->ompi_req; +} + +/** + * Actually start a recv request. + */ +void mca_pml_ubcl_irecv_start(struct ompi_request_t **request) +{ + OPAL_OUTPUT_VERBOSE( + (50, mca_pml_ubcl_component.output, "UBCL_MODULE_IRECV_START %p\n", + (void *) *request)); + + mca_pml_ubcl_request_t *req = container_of((*request), + mca_pml_ubcl_request_t, ompi_req); + void *output_buf = (void *) req->buf; + + ubcl_memory_descriptor_t rbuf_md; + ubcl_error_t err = 0; + size_t size; + + /* Init UBCL MD */ + err = ubcl_memory_descriptor_init(&rbuf_md); + if (UBCL_SUCCESS != err) { + mca_pml_ubcl_error(ubcl_error_to_ompi(err), "Failed to initialize ubcl MD"); + } + if (pml_ubcl_request_is_cuda_buf(req)) { + err = ubcl_memory_descriptor_set_properties(UBCL_BUF_IS_CUDA, &rbuf_md); + if (UBCL_SUCCESS != err) { + mca_pml_ubcl_error(ubcl_error_to_ompi(err), + "Failed to set MD properties, got error: %d", err); + } + } + + /* If we don't need to pack we can build a contiguous */ + if (! 
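+    /* need_xpack was computed at request-init time: it is set for
+     * non-contiguous datatypes (e.g. an MPI_Type_vector with a stride) and
+     * for CUDA buffers when the endpoint cannot take device pointers
+     * directly; contiguous host data is handed to UBCL as one raw region. */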
MCA_PML_UBCL_REQUEST_NEED_XPACK(req)) { + ompi_datatype_type_size(req->datatype, &size); + size *= req->count; + + err = ubcl_memory_descriptor_build_contiguous(output_buf, size, &rbuf_md); + if (UBCL_SUCCESS != err) { + mca_pml_ubcl_error(ubcl_error_to_ompi(err), + "Failed to build memory descriptor for output buffer"); + } + } + + /* Always build a custom MD representation so that we have a fallback */ + err = ubcl_memory_descriptor_build_custom((void *) &req->convertor, + pml_ubcl_datatype_pack, + pml_ubcl_datatype_unpack, + pml_ubcl_datatype_mem_size, + pml_ubcl_datatype_finish, + &rbuf_md); + if (UBCL_SUCCESS != err) { + mca_pml_ubcl_error(ubcl_error_to_ompi(err), + "Failed to build custom memory descriptor for input buffer"); + } + + /* Activate request */ + MCA_PML_UBCL_REQUEST_ACTIVATE(req); + + if (req->message != NULL) { + err = ubcl_imrecv(rbuf_md, (ubcl_message_t **) &req->message, + (ubcl_completion_callback_fct) &ubcl_request_recv_complete_cb, + *request); + } else { + uint64_t rank; + uint64_t cid; + int32_t tag = req->tag; + + if (OMPI_ANY_SOURCE == req->rank) { + rank = UBCL_ANY_SOURCE; + } else { + mca_common_ubcl_endpoint_t *endpoint = NULL; + endpoint = (mca_common_ubcl_endpoint_t *) req->proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PML]; + rank = endpoint->rank; + } + + cid = ompi_comm_get_local_cid(req->comm); + ubcl_cid_t ubcl_cid = mca_pml_ubcl_compute_ubcl_cid(req->tag, cid); + tag = req->tag; + + OPAL_OUTPUT_VERBOSE( + (50, mca_pml_ubcl_component.output, "PML/UBCL IRECV: recv from rank=%zu\n", rank)); + err = ubcl_irecv(rbuf_md, tag, ubcl_cid, rank, + (ubcl_completion_callback_fct) &ubcl_request_recv_complete_cb, + *request, &req->ubcl_operation_handle); + } + + if (UBCL_ERROR == err) { + mca_pml_ubcl_error(ubcl_error_to_ompi(err), "Failed to start recv comm"); + } + + /* Optionnal call to progress */ + if (mca_pml_ubcl_component.can_progress) { + opal_progress(); + } +} + +int mca_pml_ubcl_irecv_init(void *buf, size_t count, ompi_datatype_t *datatype, + int src, int tag, struct ompi_communicator_t *comm, + struct ompi_request_t **request) +{ + OPAL_OUTPUT_VERBOSE((50, mca_pml_ubcl_component.output, + "UBCL_MODULE_IRECV_INIT\n")); + + /* Create request */ + mca_pml_ubcl_irecv_prepare(buf, count, datatype, src, tag, comm, request, + true, false, NULL); + + return OMPI_SUCCESS; +} + +/** + * Non blocking receive primitive. Get endpoint, allocate a pml request and + * forward to selected communication module + */ +int mca_pml_ubcl_irecv(void *buf, size_t count, ompi_datatype_t *datatype, + int src, int tag, struct ompi_communicator_t *comm, + struct ompi_request_t **request) +{ + OPAL_OUTPUT_VERBOSE((50, mca_pml_ubcl_component.output, + "UBCL_MODULE_IRECV\n")); + + /* Create request and start communication */ + mca_pml_ubcl_irecv_prepare(buf, count, datatype, src, tag, comm, request, + false, false, NULL); + mca_pml_ubcl_irecv_start(request); + + return OMPI_SUCCESS; +} + +/** + * Blocking receive primitive. 
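+ * From the MPI layer, a plain MPI_Recv(buf, count, dtype, src, tag, comm,
+ * &status) ends up here.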
Call non-blocking receive and wait for request + * completion + */ +int mca_pml_ubcl_recv(void *buf, size_t count, ompi_datatype_t *datatype, + int src, int tag, struct ompi_communicator_t *comm, + ompi_status_public_t *status) +{ + OPAL_OUTPUT_VERBOSE((50, mca_pml_ubcl_component.output, + "UBCL_MODULE_RECV\n")); + + /* Create request and start communication */ + struct ompi_request_t *request = NULL; + int rc = 0; /** TODO: fix return code */ + mca_pml_ubcl_irecv_prepare(buf, count, datatype, src, tag, comm, &request, + false, false, NULL); + mca_pml_ubcl_irecv_start(&request); + + /* Wait for data to be received */ + ompi_request_wait_completion(request); + + mca_pml_ubcl_request_t *req = container_of(request, mca_pml_ubcl_request_t, + ompi_req); + rc = req->ompi_req.req_status.MPI_ERROR; + + if (MPI_STATUS_IGNORE != status) { + OMPI_COPY_STATUS(status, req->ompi_req.req_status, false); + } + + mca_pml_ubcl_request_finalize(req); + + return rc; +} + +int mca_pml_ubcl_imrecv(void *buf, size_t count, ompi_datatype_t *datatype, + struct ompi_message_t **message, + struct ompi_request_t **request) +{ + OPAL_OUTPUT_VERBOSE((50, mca_pml_ubcl_component.output, + "UBCL_MODULE_IMRECV\n")); + + /* Create request and start communication */ + mca_pml_ubcl_irecv_prepare(buf, count, datatype, (*message)->peer, + OMPI_ANY_TAG, (*message)->comm, request, + false, true, (*message)->req_ptr); + mca_pml_ubcl_irecv_start(request); + ompi_message_return(*message); + *message = MPI_MESSAGE_NULL; + return OMPI_SUCCESS; +} + +int mca_pml_ubcl_mrecv(void *buf, size_t count, ompi_datatype_t *datatype, + struct ompi_message_t **message, + ompi_status_public_t *status) +{ + OPAL_OUTPUT_VERBOSE((50, mca_pml_ubcl_component.output, + "UBCL_MODULE_MRECV\n")); + + struct ompi_request_t *request = NULL; + int rc = 0; + //we're matching any message tag + mca_pml_ubcl_irecv_prepare(buf, count, datatype, (*message)->peer, + OMPI_ANY_TAG, (*message)->comm, &request, + false, true, (*message)->req_ptr); + mca_pml_ubcl_irecv_start(&request); + + /* Wait for data to be received */ + ompi_request_wait_completion(request); + + mca_pml_ubcl_request_t *req = container_of(request, mca_pml_ubcl_request_t, + ompi_req); + rc = req->ompi_req.req_status.MPI_ERROR; + + if (MPI_STATUS_IGNORE != status) { + OMPI_COPY_STATUS(status, req->ompi_req.req_status, false); + } + + mca_pml_ubcl_request_finalize(req); + ompi_message_return(*message); + *message = MPI_MESSAGE_NULL; + + return rc; +} + diff --git a/ompi/mca/pml/ubcl/pml_ubcl_isend.c b/ompi/mca/pml/ubcl/pml_ubcl_isend.c new file mode 100644 index 00000000000..9d5b282d884 --- /dev/null +++ b/ompi/mca/pml/ubcl/pml_ubcl_isend.c @@ -0,0 +1,249 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2019-2025 Bull SAS. All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file pml_ubcl_isend.c + * + * PML/UBCL isend related functions + * + * Functions parameters and return values defined in ompi/mca/pml/pml.h. 
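+ *
+ * The MPI send modes map onto UBCL send modes in get_ubcl_send_mode() below:
+ * MPI_Ssend -> UBCL_SEND_MODE_SYNCHRONOUS, MPI_Rsend -> UBCL_SEND_MODE_READY,
+ * MPI_Bsend -> UBCL_SEND_MODE_BUFFERED, everything else (including plain
+ * MPI_Send) -> UBCL_SEND_MODE_STANDARD.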
+ */ + +#include "opal/mca/common/ubcl/common_ubcl.h" +#include "ompi/mca/pml/ubcl/pml_ubcl.h" +#include "ompi/constants.h" +#include "ompi/datatype/ompi_datatype.h" +#include "ompi/proc/proc.h" +#include "ompi/mca/pml/base/pml_base_bsend.h" +#include "ompi/mca/pml/ubcl/pml_ubcl.h" +#include "ompi/mca/pml/ubcl/pml_ubcl_request.h" +#include "ompi/mca/pml/ubcl/pml_ubcl_utils.h" +#include "ompi/request/request.h" +#include "ompi/mca/common/ubcl/common_ubcl.h" +#include "opal/mca/common/ubcl/common_ubcl.h" +#include "ubcl_api.h" + +static inline void get_ubcl_send_mode(mca_pml_base_send_mode_t mode, ubcl_send_mode_t *send_mode) +{ + switch(mode) { + case MCA_PML_BASE_SEND_SYNCHRONOUS: + *send_mode = UBCL_SEND_MODE_SYNCHRONOUS; + break; + case MCA_PML_BASE_SEND_READY: + *send_mode = UBCL_SEND_MODE_READY; + break; + case MCA_PML_BASE_SEND_BUFFERED: + *send_mode = UBCL_SEND_MODE_BUFFERED; + break; + /* Other modes not yet supported in UBCL */ + default: + *send_mode = UBCL_SEND_MODE_STANDARD; + break; + } +} + +/** + * Prepare a request for sending and perform actions according to send mode. + * + * Send modes: + * - BUFFERED = Use a specific user-defined buffer to store buf and return. + * See buffer_attach/detach + * - READY = User tells us that matching receive has already been posted by peer + * - SYNCHRONOUS = Return only when peer has begun to receive + * - STANDARD = BUFFERED or SYNCHRONOUS (up to pml to decide) + * + * By default READY is equivalent to STANDARD, except if checks are enabled by + * MCA: then receiver may print a warning or an error. + * SYNCHRONOUS forces STANDARD rendezvous protocols. + */ +static inline void mca_pml_ubcl_isend_prepare(const void *buf, size_t count, + ompi_datatype_t *datatype, int dst, int tag, + mca_pml_base_send_mode_t mode, + struct ompi_communicator_t *comm, + struct ompi_request_t **request, bool persistent) +{ + ompi_proc_t *proc; + mca_pml_ubcl_request_t *req; + + OPAL_OUTPUT_VERBOSE((50, mca_pml_ubcl_component.output, "UBCL_MODULE_ISEND_PREPARE\n")); + + /* Get proc */ + proc = ompi_comm_peer_lookup(comm, dst); + if (OPAL_UNLIKELY(NULL == proc)) { + mca_pml_ubcl_error(OMPI_ERROR, "Unknown proc"); + } + + /* Allocate request */ + req = (mca_pml_ubcl_request_t *) opal_free_list_get(&mca_pml_ubcl_component.pml_req_free_list); + if (OPAL_UNLIKELY(NULL == req)) { + mca_pml_ubcl_error(OMPI_ERR_OUT_OF_RESOURCE, "Not enough memory to allocate a PML request"); + } + + /* TODO: Find out what can be simplified in this macro and request structure */ + MCA_PML_UBCL_SEND_REQUEST_INIT(req, buf, count, datatype, dst, tag, mode, comm, proc, + persistent); + + /* Set user request */ + *request = &req->ompi_req; +} + +/** + * Actually start a send request and perform actions according to send mode. 
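+ *
+ * A buffered send completes locally once the message has been copied out of
+ * the user buffer; a minimal user-level sketch (sizes are illustrative):
+ *
+ *   char pool[64 * 1024];
+ *   MPI_Buffer_attach(pool, sizeof(pool));
+ *   MPI_Bsend(data, n, MPI_INT, peer, tag, comm);  // returns after the copy
+ *
+ * pml_ubcl_bufferize() performs that copy into the attached pool.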
+ */ +void mca_pml_ubcl_isend_start(struct ompi_request_t **request) +{ + OPAL_OUTPUT_VERBOSE( + (50, mca_pml_ubcl_component.output, "UBCL_MODULE_ISEND_START %p\n", *request)); + + mca_pml_ubcl_request_t *req = container_of((*request), mca_pml_ubcl_request_t, ompi_req); + + char *input_buf = NULL; + mca_common_ubcl_endpoint_t *endpoint = NULL; + ubcl_memory_descriptor_t sbuf_md; + ubcl_error_t err = 0; + ubcl_send_mode_t send_mode; + uint64_t cid; + int32_t tag = req->tag; + ubcl_cid_t ubcl_cid; + + /* Activate request */ + MCA_PML_UBCL_REQUEST_ACTIVATE(req); + + if (MCA_PML_BASE_SEND_BUFFERED == req->mode) { + pml_ubcl_bufferize(req); + } + get_ubcl_send_mode(req->mode, &send_mode); + + input_buf = (char*) req->buf; + + /* Retrieve endpoint and compute overall message size */ + endpoint = (mca_common_ubcl_endpoint_t *) req->proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PML]; + + /* Init UBCL MD */ + err = ubcl_memory_descriptor_init(&sbuf_md); + if (UBCL_SUCCESS != err) { + mca_pml_ubcl_error(ubcl_error_to_ompi(err), "Failed to initialize ubcl MD"); + } + if (pml_ubcl_request_is_cuda_buf(req)) { + err = ubcl_memory_descriptor_set_properties(UBCL_BUF_IS_CUDA, &sbuf_md); + if (UBCL_SUCCESS != err) { + mca_pml_ubcl_error(ubcl_error_to_ompi(err), + "Failed to set MD properties, got error: %d", err); + } + } + + /* If we don't need to pack we can build a contiguous */ + if (! MCA_PML_UBCL_REQUEST_NEED_XPACK(req)) { + ptrdiff_t gap = 0; + size_t span = opal_datatype_span(&req->datatype->super, req->count, &gap); + err = ubcl_memory_descriptor_build_contiguous(input_buf+gap, span, &sbuf_md); + if (UBCL_SUCCESS != err) { + mca_pml_ubcl_error(ubcl_error_to_ompi(err), + "Failed to build contiguous memory descriptor for input buffer"); + } + } + + /* Always build a custom MD representation so that we have a fallback */ + err = ubcl_memory_descriptor_build_custom((void *) &req->convertor, + pml_ubcl_datatype_pack, + pml_ubcl_datatype_unpack, + pml_ubcl_datatype_mem_size, + pml_ubcl_datatype_finish, &sbuf_md); + if (UBCL_SUCCESS != err) { + mca_pml_ubcl_error(ubcl_error_to_ompi(err), + "Failed to build custom memory descriptor for input buffer"); + } + + cid = ompi_comm_get_local_cid(req->comm); + ubcl_cid = mca_pml_ubcl_compute_ubcl_cid(req->tag, cid); + + OPAL_OUTPUT_VERBOSE((50, mca_pml_ubcl_component.output, + "PML/UBCL ISEND: send mpi_tag=%x comm_id=%zu\n", tag, ubcl_cid.bits)); + OPAL_OUTPUT_VERBOSE( + (50, mca_pml_ubcl_component.output, "PML/UBCL ISEND: ompi_req=%p\n", *request)); + OPAL_OUTPUT_VERBOSE( + (50, mca_pml_ubcl_component.output, "PML/UBCL ISEND: sending to rank=%zu\n", endpoint->rank)); + + err = ubcl_isend(sbuf_md, tag, ubcl_cid, endpoint->rank, send_mode, + (ubcl_completion_callback_fct) &ubcl_request_send_complete_cb, + *request, &req->ubcl_operation_handle); + if (UBCL_ERROR == err) { + mca_pml_ubcl_error(ubcl_error_to_ompi(err), "Failed to send data"); + } + + /* Optionnal call to progress */ + if (mca_pml_ubcl_component.can_progress) { + opal_progress(); + } +} + +/** + * Initialize a permanent send request + */ +int mca_pml_ubcl_isend_init(const void *buf, size_t count, ompi_datatype_t *datatype, int dst, + int tag, mca_pml_base_send_mode_t mode, + struct ompi_communicator_t *comm, struct ompi_request_t **request) +{ + OPAL_OUTPUT_VERBOSE((50, mca_pml_ubcl_component.output, "UBCL_MODULE_ISEND_INIT\n")); + + /* Create request */ + mca_pml_ubcl_isend_prepare(buf, count, datatype, dst, tag, mode, comm, request, true); + + return OMPI_SUCCESS; +} + +/** + * Non-blocking send 
primitive. Return to user as soon as possible after the + * communication is started. + */ +int mca_pml_ubcl_isend(const void *buf, size_t count, ompi_datatype_t *datatype, int dst, int tag, + mca_pml_base_send_mode_t mode, struct ompi_communicator_t *comm, + struct ompi_request_t **request) +{ + OPAL_OUTPUT_VERBOSE((50, mca_pml_ubcl_component.output, "UBCL_MODULE_ISEND\n")); + + /* Create request and start communication */ + mca_pml_ubcl_isend_prepare(buf, count, datatype, dst, tag, mode, comm, request, false); + mca_pml_ubcl_isend_start(request); + + return OMPI_SUCCESS; +} + +/** + * Blocking send primitive. Return only when buffer can be reused by user + * (i.e. either dest has received all or we buffered). + */ +int mca_pml_ubcl_send(const void *buf, size_t count, ompi_datatype_t *datatype, int dst, int tag, + mca_pml_base_send_mode_t mode, struct ompi_communicator_t *comm) +{ + int ret; + mca_pml_ubcl_request_t *request = NULL; + struct ompi_request_t *ompi_request; + + OPAL_OUTPUT_VERBOSE((50, mca_pml_ubcl_component.output, "UBCL_MODULE_SEND\n")); + + ret = mca_pml_ubcl_isend(buf, count, datatype, dst, tag, mode, comm, &ompi_request); + if (OMPI_SUCCESS != ret || NULL == ompi_request) { + return ret; + } + + request = container_of(ompi_request, mca_pml_ubcl_request_t, ompi_req); + + if (MCA_PML_BASE_SEND_BUFFERED == mode) { + /* MPI specification: Bsend is local, no information about the remote. + * PML/BXI always buffers Bsend data. No need to wait request completion */ + request->to_free = 1; + } else { + ompi_request_wait_completion(ompi_request); + mca_pml_ubcl_request_finalize(request); + } + + return OMPI_SUCCESS; +} diff --git a/ompi/mca/pml/ubcl/pml_ubcl_progress.c b/ompi/mca/pml/ubcl/pml_ubcl_progress.c new file mode 100644 index 00000000000..688f34bce08 --- /dev/null +++ b/ompi/mca/pml/ubcl/pml_ubcl_progress.c @@ -0,0 +1,38 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2019-2024 Bull SAS. All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file pml_ubcl_progress.c + * + * UBCL PML progress related functions + * + * Functions parameters and return values defined in ompi/mca/pml/pml.h. + */ + +#include "ompi/mca/pml/ubcl/pml_ubcl.h" +#include "ompi/datatype/ompi_datatype.h" +#include "ompi/proc/proc.h" + +#include + +/** + * Forward to communication modules. Could use some weight for priority given + * frequency of call with no event. + */ +int mca_pml_ubcl_progress(void) +{ + if (0 == mca_pml_ubcl_component.nprocs) { + //return OMPI_ERROR; + return OMPI_SUCCESS; + } + + ubcl_progress(); + return OMPI_SUCCESS; +} diff --git a/ompi/mca/pml/ubcl/pml_ubcl_request.c b/ompi/mca/pml/ubcl/pml_ubcl_request.c new file mode 100644 index 00000000000..282f87711a0 --- /dev/null +++ b/ompi/mca/pml/ubcl/pml_ubcl_request.c @@ -0,0 +1,386 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2019-2025 Bull SAS. All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file pml_ubcl_request.c + * + * UBCL PML Requests + * + * This file holds the MPI matching engine for the pml. It uses expected_list, + * unexpected_list and matched_list from the mca_pml_ubcl_component + * component. Messages come down from the pml interface (isend, irecv, iprobe) + * and up from the communication modules through + * mca_pml_ubcl_request_report_unexpected(). 
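+ * A message that arrives before any matching receive has been posted is
+ * parked on unexpected_list; a receive that finds no unexpected match waits
+ * on expected_list.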
+ * Matching is performed according to the MPI standard on the envelope
+ * (rank, tag, cid) and in posted order. Note that messages on different
+ * communicators are still ordered even though the standard does not require
+ * it; relaxing that would need additional development.
+ *
+ * Function parameters and return values are defined in ompi/request/request.h.
+ * The following functions are actually used, but only inside macros and
+ * through function pointers, so they are not detected by cppcheck.
+ */
+
+#include "ompi/mca/common/ubcl/common_ubcl.h"
+#include "ompi/mca/pml/ubcl/pml_ubcl.h"
+#include "ompi/mca/pml/ubcl/pml_ubcl_utils.h"
+#include "ompi/mca/pml/ubcl/pml_ubcl_request.h"
+
+OBJ_CLASS_INSTANCE(mca_pml_ubcl_request_t,
+                   opal_free_list_item_t,
+                   NULL,
+                   NULL);
+
+/**
+ * Start a PML request. Find the mca_pml_ubcl_request with the given ompi_request,
+ * reset it and start it.
+ */
+int mca_pml_ubcl_request_start(size_t count, struct ompi_request_t **requests)
+{
+    OPAL_OUTPUT_VERBOSE((50, mca_pml_ubcl_component.output, "PML/UBCL REQUEST_START %zu\n", count));
+
+    int ret = OMPI_SUCCESS;
+    for (size_t i = 0; i < count; i++) {
+        mca_pml_ubcl_request_t *req = container_of(requests[i], mca_pml_ubcl_request_t, ompi_req);
+
+        /* Save callback fields if they are not ours */
+        if (mca_pml_ubcl_request_complete_cb != req->ompi_req.req_complete_cb) {
+            req->saved_complete_cb = req->ompi_req.req_complete_cb;
+            req->saved_complete_cb_data = req->ompi_req.req_complete_cb_data;
+        } else {
+            /* Else reset fields in case of persistent request */
+            req->saved_complete_cb = NULL;
+            req->saved_complete_cb_data = NULL;
+        }
+
+        /* Reset fields if persistent request */
+        OMPI_REQUEST_INIT(&req->ompi_req, req->ompi_req.req_persistent);
+        req->ompi_req.req_complete_cb = mca_pml_ubcl_request_complete_cb;
+        req->completed = 0;
+        req->message = NULL;
+        req->prematched_req = NULL;
+        if (req->is_any_src) {
+            req->rank = OMPI_ANY_SOURCE;
+            req->proc = NULL;
+            opal_convertor_cleanup(&req->convertor);
+        } else {
+            size_t offset = 0;
+            opal_convertor_set_position(&req->convertor, &offset);
+        }
+        if (req->is_any_tag) {
+            req->tag = OMPI_ANY_TAG;
+        }
+
+        /* Start request */
+        if (MCA_PML_UBCL_REQUEST_SEND != req->type) {
+            /* Recv request */
+            mca_pml_ubcl_irecv_start(requests + i);
+        } else {
+            /* Send request */
+            mca_pml_ubcl_isend_start(requests + i);
+        }
+    }
+
+    return ret;
+}
+
+/**
+ * Free a PML request. Find the mca_pml_ubcl_request with the given ompi_request,
+ * mark it as "to be freed" and finalize it if already completed.
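+ * For example, an MPI_Request_free() on a still-active isend lands here
+ * before completion: the request is only marked, and the actual release is
+ * performed later by the completion path.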
+ */ +int +// cppcheck-suppress unusedFunction +mca_pml_ubcl_request_free(struct ompi_request_t **request) +{ + OPAL_OUTPUT_VERBOSE((50, mca_pml_ubcl_component.output, "PML/UBCL REQUEST_FREE %p %p\n", + (void *) request, (void *) *request)); + + /* Null check */ + if (MPI_REQUEST_NULL == *request) { + return OMPI_SUCCESS; + } + + mca_pml_ubcl_request_t *req = container_of((*request), mca_pml_ubcl_request_t, ompi_req); + if (!REQUEST_COMPLETE(&(req)->ompi_req) || !(req)->completed) { + /* Free called before complete : mark as "to free" */ + req->to_free = 1; + } else { + mca_pml_ubcl_request_finalize(req); + } + + *request = MPI_REQUEST_NULL; + + return OMPI_SUCCESS; +} + +/** + * Cannot cancel pml requests + */ +int +// cppcheck-suppress unusedFunction +mca_pml_ubcl_request_cancel(struct ompi_request_t *request, int complete) +{ + OPAL_OUTPUT_VERBOSE((50, mca_pml_ubcl_component.output, "PML/UBCL REQUEST_CANCEL\n")); + + mca_pml_ubcl_request_t *req = container_of(request, mca_pml_ubcl_request_t, ompi_req); + bool success = false; + ubcl_error_t err; + + /* This lock cannot be removed, even in thread single mode */ + opal_atomic_lock(&req->req_lock); + switch (req->type) { + case MCA_PML_UBCL_REQUEST_SEND: + /* Cannot cancel send requests */ + break; + case MCA_PML_UBCL_REQUEST_RECV: + if (req->completed) { + /* Cannot cancel completed requests */ + break; + } + if (NULL == req->ubcl_operation_handle) { + /* We did not store operation handle, cannot cancel */ + break; + } + + /* Try to cancel the request */ + err = ubcl_cancel(req->ubcl_operation_handle); + if (UBCL_SUCCESS != err) { + break; + } + + req->completed = true; + success = true; + break; + } + opal_atomic_unlock(&req->req_lock); + + if (!success) { + return OMPI_SUCCESS; + } + + /* If the cancel was successfull, mark the request as cancelled and complete it */ + switch (req->type) { + case MCA_PML_UBCL_REQUEST_SEND: + break; + case MCA_PML_UBCL_REQUEST_RECV: + request->req_status._cancelled = true; + ompi_request_complete(&(req->ompi_req), true); + break; + } + + return OMPI_SUCCESS; +} + +int mca_pml_ubcl_request_complete(struct ompi_request_t *request) +{ + /* Null check */ + if (MPI_REQUEST_NULL == request) { + return 0; + } + + mca_pml_ubcl_request_t *req = container_of(request, mca_pml_ubcl_request_t, ompi_req); + + /* If we saved a callback, reset the ompi_request_t fields and call it */ + if (req->saved_complete_cb) { + request->req_complete_cb = req->saved_complete_cb; + request->req_complete_cb_data = req->saved_complete_cb_data; + request->req_complete_cb(request); + } + + if (req->to_free && req->completed) { + OPAL_OUTPUT_VERBOSE( + (50, mca_pml_ubcl_component.output, "PML/UBCL REQUEST_COMPLETE CALL FINALIZE")); + mca_pml_ubcl_request_finalize(req); + return 1; + } + + return 0; +} + +/** + * Complete a PML request. Find the mca_pml_ubcl_request with the given + * ompi_request, mark it as "completed" and finalize if already freed. + */ +int mca_pml_ubcl_request_complete_cb(struct ompi_request_t *request) +{ + OPAL_OUTPUT_VERBOSE((50, mca_pml_ubcl_component.output, + "PML/UBCL REQUEST_COMPLETE CALLBACK CALLED with ompi_req=%p\n", + (void *) request)); + + return mca_pml_ubcl_request_complete(request); +} + +/* TODO: Get a pointer to status and not a cpy ? 
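+ * (the ubcl completion callbacks below receive ubcl_status_t by value)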
*/ +void ubcl_request_send_complete_cb(ubcl_status_t status, void *cb_data) +{ + if (UBCL_SUCCESS != status.status) { + mca_pml_ubcl_error(OMPI_ERROR, "UBCL error at request completion"); + } + + ompi_request_t *request = (ompi_request_t *) cb_data; + mca_pml_ubcl_request_t *req = container_of(request, mca_pml_ubcl_request_t, ompi_req); + + size_t dt_size; + ompi_datatype_type_size(req->datatype, &dt_size); + + /* This lock cannot be removed, even in thread single mode */ + opal_atomic_lock(&req->req_lock); + req->completed = 1; + opal_atomic_unlock(&req->req_lock); + if (req->is_buffered) { + mca_pml_base_bsend_request_free(req->comm, (void*)req->buf); + /* Bsend started completed, but could not be freed, now that UBCL is + * done the transfer, if MPI_Wait is done, let free it */ + if (req->to_free) { + /* MPI request has already been waited (Bsend) or freed, no one needs it anymore */ + mca_pml_ubcl_request_finalize(req); + } + } else { + /* No need to set a MPI_Status on Send operations */ + /* No need to free the request: completion callbacks will do it */ + ompi_request_complete(&(req->ompi_req), true); + } + + OPAL_OUTPUT_VERBOSE((50, mca_pml_ubcl_component.output, + "PML/UBCL SEND_COMPLETE pml_req=%p mpi_tag=%x\n", req, req->tag)); + + /** mca_pml_ubcl_request_complete((ompi_request_t *) cb_data); */ +} + +void ubcl_request_recv_complete_cb(ubcl_status_t status, void *cb_data) +{ + if (UBCL_SUCCESS != status.status) { + if (UBCL_ERR_TRUNCATE == status.status) { + if (mca_pml_ubcl_component.warn_on_truncate + || mca_pml_ubcl_component.abort_on_truncate) { + mca_pml_ubcl_warn(MPI_ERR_TRUNCATE, "Truncation error found during UBCL recv"); + } + if (mca_pml_ubcl_component.abort_on_truncate) { + ompi_mpi_abort(&ompi_mpi_comm_world.comm, MPI_ERR_TRUNCATE); + } + } else { + mca_pml_ubcl_error(OMPI_ERROR, "UBCL error at request completion"); + } + } + + ompi_request_t *request = (ompi_request_t *) cb_data; + mca_pml_ubcl_request_t *req = container_of(request, mca_pml_ubcl_request_t, ompi_req); + + mca_common_ubcl_status_to_ompi(&request->req_status, status, req->comm, req->rank); + if (MPI_STATUS_IGNORE != &request->req_status) { + request->req_status.MPI_ERROR = ubcl_error_to_ompi(status.status); + } + + /* This lock cannot be removed, even in thread single mode */ + opal_atomic_lock(&req->req_lock); + req->completed = 1; + opal_atomic_unlock(&req->req_lock); + ompi_request_complete(&(req->ompi_req), true); + + OPAL_OUTPUT_VERBOSE((50, mca_pml_ubcl_component.output, + "PML/UBCL RECV_COMPLETE pml_req=%p mpi_tag=%d\n", req, req->tag)); + + /** mca_pml_ubcl_request_complete((ompi_request_t *) cb_data); */ +} + +/** + * Really cleanup and free request after a call to request_free and + * request_complete + */ +void mca_pml_ubcl_request_finalize(mca_pml_ubcl_request_t *req) +{ + OPAL_OUTPUT_VERBOSE((50, mca_pml_ubcl_component.output, + "PML/UBCL REQUEST_FINALIZE BEGIN pml_req=%p mpi_tag=%x\n", req, req->tag)); + + opal_convertor_cleanup(&req->convertor); + OBJ_DESTRUCT(&req->convertor); + OMPI_REQUEST_FINI(&req->ompi_req); + OBJ_RELEASE(req->comm); + OMPI_DATATYPE_RELEASE(req->datatype); + OBJ_DESTRUCT(&req->ompi_req); + + opal_free_list_return(&mca_pml_ubcl_component.pml_req_free_list, (opal_free_list_item_t *) req); + + OPAL_OUTPUT_VERBOSE( + (50, mca_pml_ubcl_component.output, "PML/UBCL REQUEST_FINALIZED %p\n", req)); +} + + +bool pml_ubcl_request_is_cuda_buf(mca_pml_ubcl_request_t *req) { + if (!mca_pml_ubcl_component.accelerator_is_cuda) { + return false; + } + + return 
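+        /* non-zero when the convertor targets device memory;
+         * the double negation folds it to a strict 0/1 bool */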
!!(opal_convertor_on_device(&req->convertor)); +} + +int mca_pml_ubcl_request_need_xpack(mca_pml_ubcl_request_t *req, ubcl_endpoint_type_t type) +{ + int need_buffer; + int is_cuda_buffer; + ubcl_endpoint_capabilities_t *capabilities; + + ompi_datatype_t *datatype = req->datatype; + if (datatype->super.true_lb) { + return 1; + } + + need_buffer = opal_convertor_need_buffers(&req->convertor); + is_cuda_buffer = pml_ubcl_request_is_cuda_buf(req); + + /* If cuda contiguous ptr are allowed, we don't need to pack */ + if (!need_buffer && is_cuda_buffer) { + capabilities = &mca_pml_ubcl_component.endpoint_capabilities[type]; + /* Contiguous cuda buffer */ + if(!capabilities->allow_cuda_contig_ptr + || mca_pml_ubcl_component.force_cuda_custom_dt) { + /* Contiguous cuda ptr not allowed, forcing the use of pack/unpack */ + need_buffer = 1; + } + } + + return need_buffer; +} + +void pml_ubcl_bufferize(mca_pml_ubcl_request_t *req) +{ + if (NULL == req || req->is_buffered) { + return; + } + + void *buffer = NULL; + size_t dt_size, msg_size; + ompi_datatype_type_size(req->datatype, &dt_size); + msg_size = req->count * dt_size; + + /* TODO pack in a buffer on the same device as request buffer */ + buffer = mca_pml_base_bsend_request_alloc_buf(req->comm, msg_size); + if (NULL == buffer) { + mca_pml_ubcl_error(OMPI_ERR_OUT_OF_RESOURCE, + "Buffered mode but no more memory left in attached " + "buffer\n"); + return; + } + + struct iovec iov; + uint32_t iov_count = 1; + size_t max_data; + iov.iov_len = msg_size; + iov.iov_base = (char *) buffer; + opal_convertor_pack(&req->convertor, &iov, &iov_count, &max_data); + req->is_buffered = 1; + req->count = msg_size; + req->datatype = MPI_PACKED; + req->buf = buffer; + req->need_xpack = 0; + + /* Copy is done Bsend is completed. UBCL just have to do the job for real */ + /* No need to set a MPI_Status on Send operations */ + ompi_request_complete(&(req->ompi_req), true); +} diff --git a/ompi/mca/pml/ubcl/pml_ubcl_request.h b/ompi/mca/pml/ubcl/pml_ubcl_request.h new file mode 100644 index 00000000000..d47fa598af8 --- /dev/null +++ b/ompi/mca/pml/ubcl/pml_ubcl_request.h @@ -0,0 +1,311 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2019-2025 Bull SAS. All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file pml_ubcl_requests.h + * + * UBCL PML Requests + * + * Several specific cases are to be handled with care, namely: + * - Persistant requests: + * Not much but need to be reset at each restart. Some fields are erased + * by OMPI_REQUEST_INIT() and need to be set again. + * - Matching requests (mprobe/mrecv): + * Once matched by a matching probe, an incoming message must be locked + * and can only be received thanks to a corresponding mrecv. Two fields + * are given to allow quick access to ompi_message_t and internal request + * from the pml request. 
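+ *     The user-level sequence this supports is, for instance:
+ *       MPI_Mprobe(src, tag, comm, &msg, &status);
+ *       MPI_Mrecv(buf, count, dtype, &msg, &status);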
+ * - Final trick: + * You can have the following combinations: + * - A persistant any source receive request + * - A matching any source receive request + */ + +#ifndef MCA_PML_UBCL_REQUEST_H +#define MCA_PML_UBCL_REQUEST_H + +#include "ompi/mca/pml/base/base.h" +#include "ompi/mca/pml/pml_constants.h" +#include "ompi/message/message.h" +#include "ompi/proc/proc.h" +#include "ompi/datatype/ompi_datatype.h" +#include "ompi/communicator/communicator.h" +#include "ompi/request/request.h" +#include "ompi/mca/pml/base/pml_base_bsend.h" +#include "opal/include/opal/sys/atomic.h" +#include "opal/mca/common/ubcl/common_ubcl.h" + +#include + +BEGIN_C_DECLS + +/** + * Requests type enum + */ +typedef enum { MCA_PML_UBCL_REQUEST_SEND, MCA_PML_UBCL_REQUEST_RECV } mca_pml_ubcl_request_type_t; + +/** + * Request structure + * + * Fields map the usual MPI calls + */ +struct mca_pml_ubcl_request_t { + opal_free_list_item_t super; + ompi_request_t ompi_req; /**< Base request */ + mca_pml_ubcl_request_type_t type; + + /* PML parameters */ + uint64_t to_free:1; + uint64_t completed:1; + uint64_t need_xpack:1; + uint64_t is_buffered:1; + uint64_t is_buffer_malloced:1; + + /* Any source parameters */ + uint64_t is_any_tag:1; /**< Remember any_tag status for persistant resets */ + uint64_t is_any_src:1; /**< Remember any_src status for persistant resets and + * internal requests cleanup */ + uint64_t pad:57; + + /* MPI API parameters */ + const void *buf; + size_t count; + ompi_datatype_t *datatype; + int rank; /**< src or dest */ + int32_t tag; + int error; /**< Statuts error */ + mca_pml_base_send_mode_t mode; /**< Send mode for send requests */ + struct ompi_communicator_t *comm; /**< Communicator */ + struct ompi_proc_t *proc; /**< Remote ompi proc */ + opal_convertor_t convertor; /**< Data convertor */ + ompi_request_complete_fn_t saved_complete_cb; /**< Saved callback from another component (e.g OSC pt2pt) */ + void *saved_complete_cb_data; /**< Saved callback data from another component (e.g OSC pt2pt) */ + + /* Matching message parameters */ + ompi_message_t *message; + void *prematched_req; /**< Save matched internal request for quick mrecv */ + + /* Cancel/complete concurrency protection */ + opal_atomic_lock_t req_lock; + + /* Operation handle used for cancel */ + void *ubcl_operation_handle; +}; +typedef struct mca_pml_ubcl_request_t mca_pml_ubcl_request_t; +OBJ_CLASS_DECLARATION(mca_pml_ubcl_request_t); + +/** + * Callback functions from request system + */ +int mca_pml_ubcl_request_start(size_t count, struct ompi_request_t **requests); +int mca_pml_ubcl_request_free(struct ompi_request_t **request); +int mca_pml_ubcl_request_cancel(struct ompi_request_t *request, int flag); +int mca_pml_ubcl_request_complete_cb(struct ompi_request_t *request); +void ubcl_request_send_complete_cb(ubcl_status_t status, void *cb_data); +void ubcl_request_recv_complete_cb(ubcl_status_t status, void *cb_data); +void mca_pml_ubcl_request_finalize(mca_pml_ubcl_request_t *req); +int mca_pml_ubcl_request_probe_send(mca_pml_ubcl_request_t *req); +void pml_ubcl_bufferize(mca_pml_ubcl_request_t *req); +bool pml_ubcl_request_is_cuda_buf(mca_pml_ubcl_request_t *req); +int mca_pml_ubcl_request_need_xpack(mca_pml_ubcl_request_t *req, + ubcl_endpoint_type_t type); + +/** + * Requests accessors. 
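+ * For instance MCA_PML_UBCL_REQUEST_NEED_XPACK(req) gates the contiguous
+ * fast path in isend_start()/irecv_start().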
+ */
+#define MCA_PML_UBCL_REQUEST_ANYSRC(req) ((req)->is_any_src)
+#define MCA_PML_UBCL_REQUEST_ANYTAG(req) ((req)->is_any_tag)
+#define MCA_PML_UBCL_REQUEST_COMM(req) ((req)->comm)
+#define MCA_PML_UBCL_REQUEST_CONVERTOR(req) ((req)->convertor)
+#define MCA_PML_UBCL_REQUEST_NEED_XPACK(req) ((req)->need_xpack)
+#define MCA_PML_UBCL_REQUEST_IS_ACTIVE(req) (OMPI_REQUEST_ACTIVE == (req)->ompi_req.req_state)
+
+/**
+ * Macros for any_source messages. Mostly redundant by now: they could be
+ * folded into pml_ubcl_request_handle_match, which is the only place where
+ * they are still called.
+ **/
+#define MCA_PML_UBCL_RECV_REQUEST_UPDATE_SRC(_req, _rank) \
+    do { \
+        (_req)->rank = _rank; \
+        (_req)->proc = ompi_comm_peer_lookup((_req)->comm, (_req)->rank); \
+        MCA_PML_UBCL_RECV_REQUEST_CONVERTOR_INIT(_req); \
+    } while (0)
+#define MCA_PML_UBCL_RECV_REQUEST_UPDATE_TAG(_req, _tag) ((_req)->tag = _tag)
+
+/**
+ * Macros to handle the MPI matching interface. Same as above: these could
+ * move into the corresponding function in pml_ubcl_request.c.
+ **/
+#define MCA_PML_UBCL_RECV_REQUEST_PREMATCH(req, _prematched_req, _rank) \
+    do { \
+        (req)->message->req_ptr = req; \
+        (req)->prematched_req = _prematched_req; \
+        (req)->rank = _rank; \
+    } while (0)
+#define MCA_PML_UBCL_RECV_REQUEST_NEED_PREMATCH(req) (NULL != (req)->message)
+#define MCA_PML_UBCL_RECV_REQUEST_IS_PREMATCHED(req) (NULL != (req)->prematched_req)
+#define MCA_PML_UBCL_RECV_REQUEST_PREMATCHED_REQ(req) ((req)->prematched_req)
+
+/**
+ * Generic convenience macros
+ */
+#define MCA_PML_UBCL_SEND_REQUEST_INIT(req, _buf, _count, _datatype, _dst, _tag, _mode, _comm, \
+                                       _proc, _persistent) \
+    do { \
+        OBJ_RETAIN(_comm); \
+        OMPI_DATATYPE_RETAIN(_datatype); \
+        OBJ_CONSTRUCT(&(req)->ompi_req, ompi_request_t); \
+        OMPI_REQUEST_INIT(&req->ompi_req, _persistent); \
+        (req)->ompi_req.req_type = OMPI_REQUEST_PML; \
+        (req)->ompi_req.req_start = mca_pml_ubcl_request_start; \
+        (req)->ompi_req.req_free = mca_pml_ubcl_request_free; \
+        (req)->ompi_req.req_cancel = mca_pml_ubcl_request_cancel; \
+        (req)->ompi_req.req_complete_cb = mca_pml_ubcl_request_complete_cb; \
+        (req)->ompi_req.req_mpi_object.comm = _comm; \
+        (req)->saved_complete_cb = NULL; \
+        (req)->saved_complete_cb_data = NULL; \
+        (req)->type = MCA_PML_UBCL_REQUEST_SEND; \
+        (req)->to_free = 0; \
+        (req)->completed = 0; \
+        (req)->is_buffered = 0; \
+        (req)->is_buffer_malloced = 0; \
+        (req)->buf = _buf; \
+        (req)->count = _count; \
+        (req)->datatype = _datatype; \
+        (req)->rank = _dst; \
+        (req)->tag = _tag; \
+        (req)->error = MPI_SUCCESS; \
+        (req)->mode = _mode; \
+        (req)->comm = _comm; \
+        (req)->proc = _proc; \
+        OBJ_CONSTRUCT(&(req)->convertor, opal_convertor_t); \
+        opal_convertor_copy_and_prepare_for_send(_proc->super.proc_convertor, &_datatype->super, \
+                                                 _count, _buf, 0, &(req)->convertor); \
+        (req)->need_xpack = mca_pml_ubcl_request_need_xpack((req), \
+            ((mca_common_ubcl_endpoint_t *)(req)->proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PML])->type); \
+        (req)->message = NULL; \
+        (req)->prematched_req = NULL; \
+        (req)->is_any_tag = 0; \
+        (req)->is_any_src = 0; \
+        opal_atomic_lock_init(&((req)->req_lock), OPAL_ATOMIC_LOCK_UNLOCKED); \
+        (req)->ubcl_operation_handle = NULL; \
+    } while (0)
+
+#define MCA_PML_UBCL_RECV_REQUEST_CONVERTOR_INIT(req) \
+    do { \
+        if ((req)->is_any_src) { \
+            /* Remote proc is unknown; assume its architecture is the same as the local proc's */ \
+            opal_convertor_copy_and_prepare_for_recv(ompi_proc_local()->super.proc_convertor, \
+                                                     &(req)->datatype->super, (req)->count, \
+                                                     (req)->buf, 0, 
&(req)->convertor); \
+            /* Do not query endpoint capabilities; enable need_xpack by default */ \
+            (req)->need_xpack = (0 != (req)->datatype->super.true_lb) \
+                                || opal_convertor_need_buffers(&req->convertor); \
+        } else { \
+            opal_convertor_copy_and_prepare_for_recv((req)->proc->super.proc_convertor, \
+                                                     &(req)->datatype->super, (req)->count, \
+                                                     (req)->buf, 0, &(req)->convertor); \
+            (req)->need_xpack = mca_pml_ubcl_request_need_xpack((req), \
+                ((mca_common_ubcl_endpoint_t *)(req)->proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PML])->type); \
+        } \
+    } while (0)
+
+#define MCA_PML_UBCL_RECV_REQUEST_INIT(req, _buf, _count, _datatype, _src, \
+                                       _tag, _comm, _proc, _persistent, \
+                                       _probe, _mes) \
+    do { \
+        OBJ_RETAIN(_comm); \
+        OMPI_DATATYPE_RETAIN(_datatype); \
+        OBJ_CONSTRUCT(&(req)->ompi_req, ompi_request_t); \
+        OMPI_REQUEST_INIT(&req->ompi_req, _persistent); \
+        (req)->ompi_req.req_type = OMPI_REQUEST_PML; \
+        (req)->ompi_req.req_start = mca_pml_ubcl_request_start; \
+        (req)->ompi_req.req_free = mca_pml_ubcl_request_free; \
+        (req)->ompi_req.req_cancel = mca_pml_ubcl_request_cancel; \
+        (req)->ompi_req.req_complete_cb = mca_pml_ubcl_request_complete_cb; \
+        (req)->ompi_req.req_mpi_object.comm = _comm; \
+        (req)->saved_complete_cb = NULL; \
+        (req)->saved_complete_cb_data = NULL; \
+        (req)->type = MCA_PML_UBCL_REQUEST_RECV; \
+        (req)->to_free = 0; \
+        (req)->completed = 0; \
+        (req)->is_buffered = 0; \
+        (req)->is_buffer_malloced = 0; \
+        (req)->buf = _buf; \
+        (req)->count = _count; \
+        (req)->datatype = _datatype; \
+        (req)->rank = _src; \
+        (req)->tag = _tag; \
+        (req)->error = MPI_SUCCESS; \
+        (req)->mode = MCA_PML_BASE_SEND_SIZE; \
+        (req)->comm = _comm; \
+        (req)->proc = _proc; \
+        OBJ_CONSTRUCT(&(req)->convertor, opal_convertor_t); \
+        (req)->message = (void *) _mes; \
+        (req)->prematched_req = NULL; \
+        (req)->is_any_tag = (_tag == OMPI_ANY_TAG); \
+        opal_atomic_lock_init(&((req)->req_lock), OPAL_ATOMIC_LOCK_UNLOCKED); \
+        (req)->ubcl_operation_handle = NULL; \
+        if (OMPI_ANY_SOURCE == (req)->rank) { \
+            (req)->is_any_src = 1; \
+        } else { \
+            (req)->is_any_src = 0; \
+        } \
+        MCA_PML_UBCL_RECV_REQUEST_CONVERTOR_INIT(req); \
+    } while (0)
+
+#define MCA_PML_UBCL_RECV_REQUEST_MPROBE_TO_MRECV(req, _buf, _count, _datatype) \
+    do { \
+        OMPI_DATATYPE_RETAIN(_datatype); \
+        (req)->type = MCA_PML_UBCL_REQUEST_RECV; \
+        (req)->buf = _buf; \
+        (req)->count = _count; \
+        (req)->datatype = _datatype; \
+        (req)->proc = ompi_comm_peer_lookup((req)->comm, (req)->rank); \
+        MCA_PML_UBCL_RECV_REQUEST_CONVERTOR_INIT(req); \
+    } while (0)
+
+#define MCA_PML_UBCL_REQUEST_ACTIVATE(req) \
+    do { \
+        (req)->ompi_req.req_state = OMPI_REQUEST_ACTIVE; \
+        (req)->ompi_req.req_complete = REQUEST_PENDING; \
+        (req)->ompi_req.req_status.MPI_SOURCE = OMPI_ANY_SOURCE; \
+        (req)->ompi_req.req_status.MPI_TAG = OMPI_ANY_TAG; \
+        (req)->ompi_req.req_status.MPI_ERROR = OMPI_SUCCESS; \
+        (req)->ompi_req.req_status._ucount = 0; \
+        (req)->ompi_req.req_status._cancelled = 0; \
+    } while (0)
+
+#define MCA_PML_UBCL_STATUS_SET(stat, rank, tag, err, size) \
+    do { \
+        (stat)->MPI_SOURCE = rank; \
+        (stat)->MPI_TAG = tag; \
+        (stat)->MPI_ERROR = err; \
+        (stat)->_ucount = size; \
+        (stat)->_cancelled = false; \
+    } while (0)
+
+#define MCA_PML_UBCL_REQUEST_SET_STATUS(req, rank, tag, err, size) \
+    do { \
+        MCA_PML_UBCL_STATUS_SET(&(req)->ompi_req.req_status, rank, tag, err, size); \
+    } while (0)
+
+#define MCA_PML_UBCL_REQUEST_CPY_STATUS(status, req) \
+    do { \
+        status->MPI_SOURCE = 
(req)->ompi_req.req_status.MPI_SOURCE; \
+        status->MPI_TAG = (req)->ompi_req.req_status.MPI_TAG; \
+        status->MPI_ERROR = (req)->ompi_req.req_status.MPI_ERROR; \
+        status->_ucount = (req)->ompi_req.req_status._ucount; \
+        status->_cancelled = (req)->ompi_req.req_status._cancelled; \
+    } while (0)
+
+END_C_DECLS
+
+#endif /* MCA_PML_UBCL_REQUEST_H */
diff --git a/ompi/mca/pml/ubcl/pml_ubcl_utils.c b/ompi/mca/pml/ubcl/pml_ubcl_utils.c
new file mode 100644
index 00000000000..cba136dc192
--- /dev/null
+++ b/ompi/mca/pml/ubcl/pml_ubcl_utils.c
@@ -0,0 +1,43 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
+/*
+ * Copyright (c) 2019-2024 Bull SAS. All rights reserved.
+ *
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+/**
+ * @file pml_ubcl_utils.c
+ *
+ * UBCL PML utilities
+ *
+ * Contains some useful functions
+ *
+ */
+
+#include "pml_ubcl_utils.h"
+#include "pml_ubcl.h"
+#include 
+
+/* Reserve 1 cid bit to prevent MPI_ANY_TAG from matching
+ * messages with a negative tag, which are OMPI-reserved tags
+ */
+#define CID_RESERVED_BIT (((uint64_t) 1) << 63)
+
+ubcl_cid_t mca_pml_ubcl_compute_ubcl_cid(int tag, int cid)
+{
+    ubcl_cid_t ubcl_cid;
+    ubcl_cid.cid.communicator = cid;
+
+    int is_collective_tag = tag < 0 && MPI_ANY_TAG != tag;
+    if (is_collective_tag) {
+        ubcl_cid.cid.runtime = UBCL_CID_MPI_INTERNAL;
+    } else {
+        ubcl_cid.cid.runtime = UBCL_CID_MPI_APPLICATION;
+    }
+
+    return ubcl_cid;
+}
+
diff --git a/ompi/mca/pml/ubcl/pml_ubcl_utils.h b/ompi/mca/pml/ubcl/pml_ubcl_utils.h
new file mode 100644
index 00000000000..88c824542e9
--- /dev/null
+++ b/ompi/mca/pml/ubcl/pml_ubcl_utils.h
@@ -0,0 +1,39 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
+/*
+ * Copyright (c) 2019-2024 Bull SAS. All rights reserved.
+ *
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+/**
+ * @file pml_ubcl_utils.h
+ *
+ * UBCL PML
+ *
+ * Contains some useful functions
+ *
+ */
+
+#ifndef MCA_PML_UBCL_UTILS_H
+#define MCA_PML_UBCL_UTILS_H
+
+#include "ompi/mca/common/ubcl/common_ubcl.h"
+#include "ompi/mca/pml/ubcl/pml_ubcl.h"
+#include "opal/util/output.h"
+
+#define PML_UBCL_COMP_NAME "PML/UBCL"
+
+#define mca_pml_ubcl_log(lvl, ...) \
+    opal_output_verbose(lvl, mca_pml_ubcl_component.output, __VA_ARGS__)
+
+#define mca_pml_ubcl_warn(err, format, ...) \
+    _mca_common_ubcl_error(__FILE__, __LINE__, err, false, 5, mca_pml_ubcl_component.output, mca_pml_ubcl_component.is_init, mca_pml_ubcl_component.verbose, PML_UBCL_COMP_NAME, format, ##__VA_ARGS__)
+#define mca_pml_ubcl_error(err, format, ...) \
+    _mca_common_ubcl_error(__FILE__, __LINE__, err, true, 1, mca_pml_ubcl_component.output, mca_pml_ubcl_component.is_init, mca_pml_ubcl_component.verbose, PML_UBCL_COMP_NAME, format, ##__VA_ARGS__)
+
+ubcl_cid_t mca_pml_ubcl_compute_ubcl_cid(int tag, int cid);
+
+#endif /* MCA_PML_UBCL_UTILS_H */
diff --git a/ompi/mca/pml/ubcl/post_configure.sh b/ompi/mca/pml/ubcl/post_configure.sh
new file mode 100644
index 00000000000..634b9a3f1d8
--- /dev/null
+++ b/ompi/mca/pml/ubcl/post_configure.sh
@@ -0,0 +1,2 @@
+DIRECT_CALL_HEADER="ompi/mca/pml/ubcl/pml_ubcl.h"
+# Copyright (c) 2024 BULL S.A.S. All rights reserved.
diff --git a/opal/mca/common/ubcl/Makefile.am b/opal/mca/common/ubcl/Makefile.am
new file mode 100644
index 00000000000..d99e62f9652
--- /dev/null
+++ b/opal/mca/common/ubcl/Makefile.am
@@ -0,0 +1,105 @@
+#
+# Copyright (c) 2020-2024 Bull SAS. All rights reserved.
+# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# Note that building this common component statically and linking +# against other dynamic components is *not* supported! + +# Header files + +headers = \ + common_ubcl.h + +# Source files + +sources = \ + common_ubcl.c + +# Help file + +dist_opaldata_DATA = \ + help-mpi-common-ubcl.txt + +# As per above, we'll either have an installable or noinst result. +# The installable one should follow the same MCA prefix naming rules +# (i.e., libmca__.la). The noinst one can be named +# whatever it wants, although libmca___noinst.la is +# recommended. + +# To simplify components that link to this library, we will *always* +# have an output libtool library named libmca__.la -- even +# for case 2) described above (i.e., so there's no conditional logic +# necessary in component Makefile.am's that link to this library). +# Hence, if we're creating a noinst version of this library (i.e., +# case 2), we sym link it to the libmca__.la name +# (libtool will do the Right Things under the covers). See the +# all-local and clean-local rules, below, for how this is effected. + +common_ubcl_CFLAGS= -Werror -Wall + +lib_LTLIBRARIES = +noinst_LTLIBRARIES = +comp_inst = lib@OPAL_LIB_NAME@mca_common_ubcl.la +comp_noinst = lib@OPAL_LIB_NAME@mca_common_ubcl_noinst.la + +if MCA_BUILD_opal_common_ubcl_DSO +lib_LTLIBRARIES += $(comp_inst) +else +noinst_LTLIBRARIES += $(comp_noinst) +endif + +lib@OPAL_LIB_NAME@mca_common_ubcl_la_SOURCES = \ + $(headers) $(sources) +lib@OPAL_LIB_NAME@mca_common_ubcl_la_CFLAGS = \ + $(common_ubcl_CFLAGS) +lib@OPAL_LIB_NAME@mca_common_ubcl_la_CPPFLAGS = \ + $(common_ubcl_CPPFLAGS) +lib@OPAL_LIB_NAME@mca_common_ubcl_la_LDFLAGS = \ + $(common_ubcl_LDFLAGS) +lib@OPAL_LIB_NAME@mca_common_ubcl_la_LIBADD = \ + $(common_ubcl_LIBS) \ + $(top_builddir)/opal/lib@OPAL_LIB_NAME@.la + +lib@OPAL_LIB_NAME@mca_common_ubcl_noinst_la_SOURCES = \ + $(headers) $(sources) +lib@OPAL_LIB_NAME@mca_common_ubcl_noinst_la_CFLAGS = \ + $(common_ubcl_CFLAGS) +lib@OPAL_LIB_NAME@mca_common_ubcl_noinst_la_CPPFLAGS = \ + $(common_ubcl_CPPFLAGS) +lib@OPAL_LIB_NAME@mca_common_ubcl_noinst_la_LDFLAGS = \ + $(common_ubcl_LDFLAGS) +lib@OPAL_LIB_NAME@mca_common_ubcl_noinst_la_LIBADD = \ + $(common_ubcl_LIBS) + +# Conditionally install the header files + +if WANT_INSTALL_HEADERS +opaldir = $(opalincludedir)/$(subdir) +opal_HEADERS = $(headers) +endif + +# These two rules will sym link the "noinst" libtool library filename +# to the installable libtool library filename in the case where we are +# compiling this component statically (case 2), described above). + +# See Makefile.ompi-rules for an explanation of the "V" macros, below +V=0 +OMPI_V_LN_SCOMP = $(ompi__v_LN_SCOMP_$V) +ompi__v_LN_SCOMP_ = $(ompi__v_LN_SCOMP_$AM_DEFAULT_VERBOSITY) +ompi__v_LN_SCOMP_0 = @echo " LN_S " `basename $(comp_inst)`; + +all-local: + $(OMPI_V_LN_SCOMP) if test -z "$(lib_LTLIBRARIES)"; then \ + rm -f "$(comp_inst)"; \ + $(LN_S) "$(comp_noinst)" "$(comp_inst)"; \ + fi + +clean-local: + if test -z "$(lib_LTLIBRARIES)"; then \ + rm -f "$(comp_inst)"; \ + fi diff --git a/opal/mca/common/ubcl/common_ubcl.c b/opal/mca/common/ubcl/common_ubcl.c new file mode 100644 index 00000000000..8b39800ab5b --- /dev/null +++ b/opal/mca/common/ubcl/common_ubcl.c @@ -0,0 +1,445 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2024 Bull SAS. All rights reserved. 
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+#include "opal_config.h"
+
+#include 
+#include 
+#include 
+#include 
+
+#include "opal/mca/base/mca_base_var.h"
+#include "opal/mca/dl/base/base.h"
+#include "opal/util/argv.h"
+#include "opal/util/output.h"
+#include "opal/util/proc.h"
+#include "opal/util/show_help.h"
+
+#include "common_ubcl.h"
+
+/**
+ * Common UBCL component
+ */
+mca_opal_common_ubcl_component_t mca_opal_common_ubcl_component = {
+    .output = 0,
+    .verbose = 0,
+    .ld_library_path_fail_warn = true,
+    .search_opt_ubcl = true,
+    .force_ld_lib_dlopen = false,
+    .ubcl_search_path = NULL,
+
+    .is_init = 0,
+    .is_registered = 0,
+    .is_dlopen = 0,
+};
+const char *default_search_path = "/opt/ubcl/";
+
+/*
+ * Version of the UBCL API we need
+ */
+ubcl_api_version_t my_api_version = {
+    .major = UBCL_API_VERSION_MAJOR,
+    .minor = UBCL_API_VERSION_MINOR,
+};
+
+/* Handle to libubcl.so */
+opal_dl_handle_t *libubcl_handle = NULL;
+
+static int mca_common_ubcl_scandir_filter(const struct dirent *dir)
+{
+    char* dirname_copy = NULL;
+    char* saved_ptr = NULL;
+    char* digit_str = NULL;
+    char* endptr = NULL;
+    unsigned long digit = 0;
+    int digit_position = 0;
+
+    /* Filter out '.' and '..' */
+    if (0 == strcmp(dir->d_name, ".") || 0 == strcmp(dir->d_name, "..")) {
+        return 0;
+    }
+
+    /* Only keep directories and entries of unknown type */
+    if (DT_DIR != dir->d_type && DT_UNKNOWN != dir->d_type) {
+        return 0;
+    }
+
+    /* Filter out folders that don't look like X.Y.Z */
+    dirname_copy = strdup(dir->d_name);
+    digit_str = strtok_r(dirname_copy, ".", &saved_ptr);
+    while (digit_str != NULL) {
+        digit = strtol(digit_str, &endptr, 10);
+        if (digit_str == endptr) {
+            common_ubcl_log_verbose(95, "DIGIT: '%s' doesn't start with a number\n",
+                                    digit_str);
+            goto free_and_fail;
+        } else if ('\0' != *endptr) {
+            common_ubcl_log_verbose(95, "DIGIT: '%s' contains a non-digit\n",
+                                    digit_str);
+            goto free_and_fail;
+        } else {
+            switch (digit_position) {
+            case 0:
+                if (digit != my_api_version.major) {
+                    common_ubcl_log_verbose(95, "Wrong API_MAJOR version: "
+                                            "%lu != %u\n", digit,
+                                            my_api_version.major);
+                    goto free_and_fail;
+                }
+                break;
+            case 1:
+                if (digit < my_api_version.minor) {
+                    common_ubcl_log_verbose(95, "Wrong API_MINOR version: "
+                                            "%lu < %u\n", digit,
+                                            my_api_version.minor);
+                    goto free_and_fail;
+                }
+                break;
+            case 2:
+                break;
+            default:
+                common_ubcl_log_verbose(95, "'%s' has more than 3 digits",
+                                        dir->d_name);
+                goto free_and_fail;
+            }
+        }
+        digit_position++;
+        digit_str = strtok_r(NULL, ".", &saved_ptr);
+    }
+
+    free(dirname_copy);
+    return 1;
+
+free_and_fail:
+    common_ubcl_log_verbose(95, "Filtering out '%s'", dir->d_name);
+    free(dirname_copy);
+    return 0;
+}
+
+static int mca_common_ubcl_find_ubcl_install(char*** searchpaths)
+{
+    int nb_dir, i;
+    int inv_i;
+    struct dirent **verslist;
+    const char* ubcl_search_path = *mca_opal_common_ubcl_component.ubcl_search_path;
+
+    nb_dir = scandir(ubcl_search_path, &verslist, mca_common_ubcl_scandir_filter, versionsort);
+    if (-1 == nb_dir) {
+        common_ubcl_warning("Failed to scan %s, error: %s", ubcl_search_path, strerror(errno));
+        return nb_dir;
+    }
+
+    /* Allocate two extra slots: one for the search path itself and one for the terminating NULL */
+    (*searchpaths) = malloc( (2 + nb_dir) * sizeof(char*));
+    asprintf((*searchpaths)+nb_dir, "%s/lib", ubcl_search_path);
+    (*searchpaths)[nb_dir + 1] = NULL;
+
+    /* Iterate backwards to get higher versions first */
+    inv_i = 0;
+    for (i = nb_dir - 1; i >= 0; i--) {
+        
asprintf((*searchpaths)+inv_i, "%s/%s/lib", ubcl_search_path, verslist[i]->d_name);
+        free(verslist[i]);
+        inv_i++;
+    }
+    free(verslist);
+
+    return nb_dir;
+}
+
+static void mca_common_ubcl_free_found_searchpaths(char*** searchpaths, int nb_dir) {
+    int i;
+
+    for (i = 0; i < nb_dir; i++) {
+        free((*searchpaths)[i]);
+    }
+    free((*searchpaths)[nb_dir]);
+
+    free(*searchpaths);
+    (*searchpaths) = NULL;
+}
+
+/* Returns OPAL_SUCCESS / OPAL_ERROR, so the return type is int, not bool */
+static int mca_common_ubcl_test_lib_version(char* filename) {
+    int ret;
+    char *err_msg;
+    const char *ubcl_api_symbol = "ubcl_api_version";
+    ubcl_api_version_t ubcl_lib_api_version;
+    void *symbol = NULL;
+
+    ret = opal_dl_lookup(libubcl_handle, ubcl_api_symbol, &symbol, &err_msg);
+    if (OPAL_SUCCESS != ret) {
+        common_ubcl_warning("Library %s opened but no %s symbol found."
+                            " It probably is an older version, skipping.\n",
+                            filename, ubcl_api_symbol);
+        return OPAL_ERROR;
+    }
+    ubcl_lib_api_version = *(ubcl_api_version_t*)symbol;
+
+    if (ubcl_lib_api_version.major != my_api_version.major) {
+        common_ubcl_warning("Library %s opened but API version major digit"
+                            " '%d' isn't the wanted: '%d'. Skipping\n",
+                            filename, ubcl_lib_api_version.major, my_api_version.major);
+        return OPAL_ERROR;
+    }
+
+    if (ubcl_lib_api_version.minor < my_api_version.minor) {
+        common_ubcl_warning("Library %s opened but API version minor '%d' "
+                            "lower than the minimum wanted: '%d'. Skipping\n",
+                            filename, ubcl_lib_api_version.minor, my_api_version.minor);
+        return OPAL_ERROR;
+    }
+
+    common_ubcl_log_verbose(20, "    Accepting library %s with API version: '%d.%d',"
+                            " (wanted: '%d.%d')\n", filename,
+                            ubcl_lib_api_version.major, ubcl_lib_api_version.minor,
+                            my_api_version.major, my_api_version.minor);
+    return OPAL_SUCCESS;
+}
+
+static bool mca_common_ubcl_try_dlopen(char** searchpaths, char** ubcllibs, char*** errmsgs) {
+    int retval;
+    int errsize = 0; /* opal_argv_append() reads this count, so it must start at 0 */
+    bool dlopen_success = false;
+    int j = 0;
+
+    while (searchpaths[j] != NULL) {
+        int i = 0;
+        while (ubcllibs[i] != NULL) {
+            char *filename = NULL;
+            char *str = NULL;
+
+            /* If there's a non-empty search path, prepend it
+               to the library filename */
+            if (strlen(searchpaths[j]) > 0) {
+                asprintf(&filename, "%s/%s", searchpaths[j], ubcllibs[i]);
+            } else {
+                filename = strdup(ubcllibs[i]);
+            }
+            if (NULL == filename) {
+                opal_show_help("help-mpi-common-ubcl.txt", "No memory",
+                               true, OPAL_PROC_MY_HOSTNAME);
+                return false; /* this function returns a bool, not an OPAL error code */
+            }
+
+            retval = opal_dl_open(filename, false, false,
+                                  &libubcl_handle, &str);
+            if (OPAL_SUCCESS != retval || NULL == libubcl_handle) {
+                if (NULL != str) {
+                    opal_argv_append(&errsize, errmsgs, str);
+                } else {
+                    opal_argv_append(&errsize, errmsgs,
+                                     "opal_dl_open() returned NULL.");
+                }
+                common_ubcl_log_verbose(10, "UBCL: Library open error: %s",
+                                        (*errmsgs)[errsize-1]);
+            } else {
+                if (mca_opal_common_ubcl_component.force_ld_lib_dlopen) {
+                    /* Force retval to fake a good version check */
+                    retval = OPAL_SUCCESS;
+                } else {
+                    /* We opened a UBCL library, now we need to check the version */
+                    retval = mca_common_ubcl_test_lib_version(filename);
+                }
+
+                if (OPAL_SUCCESS != retval) {
+                    asprintf(&str, "%s opened but version check failed. 
Skipping", filename);
+                    opal_argv_append(&errsize, errmsgs, str);
+                    opal_dl_close(libubcl_handle);
+                    libubcl_handle = NULL;
+                } else {
+                    common_ubcl_log_verbose(10, "UBCL: Library successfully "
+                                            "opened %s", filename);
+                    dlopen_success = true;
+                    free(filename);
+                    break;
+                }
+            }
+            i++;
+
+            free(filename);
+        }
+        if (true == dlopen_success) {
+            break; /* Break out of outer loop */
+        }
+        j++;
+    }
+    return dlopen_success;
+}
+
+static int mca_common_ubcl_dlopen_ubcl(void)
+{
+    char *ubcllibs[] = { "libubcl.so", "libubcl.so.0", NULL };
+    char *searchpaths[] = { "", NULL };
+    char **opt_searchpaths = NULL;
+    char **errmsgs = NULL;
+    char *errmsg = NULL;
+    bool dlopen_success = false;
+    int nb_dir = 0;
+
+    if (1 < opal_atomic_add_fetch_32(&mca_opal_common_ubcl_component.is_dlopen, 1)) {
+        return OPAL_SUCCESS;
+    }
+
+    if (!OPAL_HAVE_DL_SUPPORT) {
+        opal_show_help("help-mpi-common-ubcl.txt", "dlopen disabled", true);
+        return OPAL_ERR_NOT_AVAILABLE;
+    }
+
+    common_ubcl_log_verbose(10, "COMMON_UBCL: Starting to look for the UBCL"
+                            " library");
+
+
+    /* Now walk through all the potential names of libubcl and find one that
+     * works. If one does, all is good. If not, print out all the messages about
+     * why things failed. This code carefully saves away every error message
+     * to help with debugging if the loading ultimately fails. */
+
+
+    /* On the first try we just utilize the default loading paths from
+     * the system, so that LD_LIBRARY_PATH is searched first */
+    dlopen_success = mca_common_ubcl_try_dlopen(searchpaths, ubcllibs, &errmsgs);
+
+    if (true == dlopen_success) {
+        goto success;
+    }
+
+    if (mca_opal_common_ubcl_component.ld_library_path_fail_warn) {
+        common_ubcl_warning("We did not find a compatible UBCL in LD_LIBRARY_PATH\n");
+    }
+
+    if (mca_opal_common_ubcl_component.force_ld_lib_dlopen) {
+        common_ubcl_error("No UBCL found in LD_LIBRARY_PATH and 'force_ld_lib_dlopen'"
+                          " set to 'true'. We cannot load UBCL for the PML/UBCL to use\n");
+        goto failed;
+    }
+
+    if (!mca_opal_common_ubcl_component.search_opt_ubcl) {
+        common_ubcl_error("No UBCL found in LD_LIBRARY_PATH and 'search_opt_ubcl'"
+                          " set to 'false'. 
We cannot load UBCL for the PML/UBCL to use\n");
+        goto failed;
+    }
+
+    nb_dir = mca_common_ubcl_find_ubcl_install(&opt_searchpaths);
+
+    if (-1 == nb_dir) {
+        goto failed;
+    }
+
+    /* Now look into the paths found by 'find_ubcl_install' */
+    dlopen_success = mca_common_ubcl_try_dlopen(opt_searchpaths, ubcllibs, &errmsgs);
+    mca_common_ubcl_free_found_searchpaths(&opt_searchpaths, nb_dir);
+
+    if (true == dlopen_success) {
+        goto success;
+    }
+
+failed:
+    errmsg = opal_argv_join(errmsgs, '\n');
+    opal_show_help("help-mpi-common-ubcl.txt", "dlopen failed", true,
+                   errmsg);
+    opal_argv_free(errmsgs);
+    free(errmsg);
+    return OPAL_ERR_NOT_AVAILABLE;
+
+success:
+    opal_argv_free(errmsgs);
+    free(errmsg);
+    return OPAL_SUCCESS;
+}
+
+void mca_common_ubcl_register_mca(void)
+{
+    if (1 < opal_atomic_add_fetch_32(&mca_opal_common_ubcl_component.is_registered, 1)) {
+        return;
+    }
+    MCA_REGISTER_COMMON_UBCL("verbose", "Verbosity level of component common/ubcl",
+                             MCA_BASE_VAR_TYPE_INT, &mca_opal_common_ubcl_component.verbose);
+    MCA_REGISTER_COMMON_UBCL("ld_lib_path_fail_warn",
+                             "Warn the user when no fitting libraries were found"
+                             " in the default system loading path (LD_LIBRARY_PATH)",
+                             MCA_BASE_VAR_TYPE_BOOL, &mca_opal_common_ubcl_component.ld_library_path_fail_warn);
+    MCA_REGISTER_COMMON_UBCL("force_ld_lib_dlopen",
+                             "Force common/ubcl to dlopen and use a UBCL library"
+                             " found in LD_LIBRARY_PATH, regardless of API version",
+                             MCA_BASE_VAR_TYPE_BOOL, &mca_opal_common_ubcl_component.force_ld_lib_dlopen);
+    MCA_REGISTER_COMMON_UBCL("search_opt_ubcl",
+                             "In case we don't find a suitable UBCL library in "
+                             "LD_LIBRARY_PATH, automatically search /opt/ubcl for a compatible UBCL",
+                             MCA_BASE_VAR_TYPE_BOOL, &mca_opal_common_ubcl_component.search_opt_ubcl);
+
+    // Extra level of string indirection needed to make ompi_info
+    // happy since it will unload this library before the MCA base
+    // cleans up the MCA vars. This will cause the string to go
+    // out of scope unless we place the pointer to it on the heap.
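+    // In other words, the variable's backing storage is a heap cell that
+    // outlives this DSO, rather than a static that would be unmapped with it.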
+    mca_opal_common_ubcl_component.ubcl_search_path = malloc(sizeof(char*));
+    *mca_opal_common_ubcl_component.ubcl_search_path = default_search_path;
+    MCA_REGISTER_COMMON_UBCL("ubcl_search_path",
+                             "When 'search_opt_ubcl' is true, search for UBCL"
+                             " version directories at this path",
+                             MCA_BASE_VAR_TYPE_STRING, mca_opal_common_ubcl_component.ubcl_search_path);
+}
+
+int mca_common_ubcl_init(void)
+{
+    int ret;
+
+    /* Safeguard against multiple init/fini */
+    if (1 < opal_atomic_add_fetch_32(&mca_opal_common_ubcl_component.is_init, 1)) {
+        /* UBCL already init */
+        return OPAL_SUCCESS;
+    }
+
+    /* Open output stream */
+    if (0 <= mca_opal_common_ubcl_component.verbose) {
+        mca_opal_common_ubcl_component.output = opal_output_open(NULL);
+        opal_output_set_verbosity(mca_opal_common_ubcl_component.output,
+                                  mca_opal_common_ubcl_component.verbose);
+        common_ubcl_log_verbose(10, "Opening common/ubcl component\n");
+    } else {
+        mca_opal_common_ubcl_component.output = -1;
+    }
+
+    /* Initializing modules */
+    ret = mca_common_ubcl_dlopen_ubcl();
+
+    if (ret != OPAL_SUCCESS) {
+        common_ubcl_error("Could not dlopen UBCL");
+    }
+
+    return ret;
+}
+
+int mca_common_ubcl_fini(void)
+{
+    int ret;
+    uint32_t refcount;
+
+    /* Safeguard against multiple init/fini */
+    refcount = opal_atomic_fetch_sub_32(&mca_opal_common_ubcl_component.is_init, 1);
+    assert (0 < refcount);
+
+    if (1 < refcount) {
+        /* Not the last 'fini' */
+        return OPAL_SUCCESS;
+    }
+
+    common_ubcl_log_verbose(10, "Closing common/ubcl component\n");
+
+    /* Closing output (same condition as the open in mca_common_ubcl_init()) */
+    if (0 <= mca_opal_common_ubcl_component.verbose) {
+        opal_output_close(mca_opal_common_ubcl_component.output);
+    }
+
+    /* The handle may be NULL if the dlopen failed during init */
+    ret = (NULL != libubcl_handle) ? opal_dl_close(libubcl_handle) : OPAL_SUCCESS;
+
+    return ret;
+}
+
+int mca_common_ubcl_is_init(void) {
+    return (int) opal_atomic_add_fetch_32(&mca_opal_common_ubcl_component.is_init, 0);
+}
diff --git a/opal/mca/common/ubcl/common_ubcl.h b/opal/mca/common/ubcl/common_ubcl.h
new file mode 100644
index 00000000000..05eeccfaa2f
--- /dev/null
+++ b/opal/mca/common/ubcl/common_ubcl.h
@@ -0,0 +1,72 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
+/*
+ * Copyright (c) 2024 Bull SAS. All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#ifndef OPAL_MCA_COMMON_UBCL_H
+#define OPAL_MCA_COMMON_UBCL_H
+
+#include "opal/mca/mca.h"
+#include "opal/class/opal_list.h"
+#include "opal/util/show_help.h"
+#include 
+
+#define MCA_REGISTER_COMMON_UBCL(name, desc, type, var) \
+    mca_base_var_register("ompi", "mpi", "common_ubcl", name, desc, type, NULL, 0, \
+                          MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_LOCAL, \
+                          var)
+
+BEGIN_C_DECLS
+
+struct mca_opal_common_ubcl_component_t {
+    /* MCA params */
+    int output;
+    int verbose;
+    bool gdb_attach;
+    bool ld_library_path_fail_warn;
+    bool search_opt_ubcl;
+    bool force_ld_lib_dlopen;
+    const char** ubcl_search_path;
+
+    /* Miscellaneous */
+    int32_t is_init;
+    int32_t is_registered;
+    int32_t is_dlopen;
+};
+typedef struct mca_opal_common_ubcl_component_t mca_opal_common_ubcl_component_t;
+OPAL_DECLSPEC extern mca_opal_common_ubcl_component_t mca_opal_common_ubcl_component;
+
+struct mca_common_ubcl_endpoint_t {
+    uint32_t type;
+    int32_t refcount;
+    uint64_t rank;
+};
+typedef struct mca_common_ubcl_endpoint_t mca_common_ubcl_endpoint_t;
+
+
+void mca_common_ubcl_register_mca(void);
+int mca_common_ubcl_init(void);
+int mca_common_ubcl_fini(void);
+int mca_common_ubcl_is_init(void);
+
+#define common_ubcl_generic(__token, ...) 
\
+    opal_output(mca_opal_common_ubcl_component.output, "[COMMON/UBCL] "__token __VA_ARGS__)
+
+#define common_ubcl_error(...) common_ubcl_generic("ERROR: ", __VA_ARGS__)
+#define common_ubcl_warning(...) common_ubcl_generic("WARNING: ", __VA_ARGS__)
+#define common_ubcl_log(...) common_ubcl_generic(" ", __VA_ARGS__)
+
+#define common_ubcl_log_verbose(__lvl, ...) \
+    opal_output_verbose(__lvl, mca_opal_common_ubcl_component.output, "[COMMON/UBCL] "__VA_ARGS__)
+
+#define mca_common_ubcl_help(name, ...) \
+    opal_show_help("help-mpi-common-ubcl.txt", name, true, "[COMMON/UBCL]", ##__VA_ARGS__)
+
+END_C_DECLS
+
+#endif /* OPAL_MCA_COMMON_UBCL_H */
diff --git a/opal/mca/common/ubcl/configure.m4 b/opal/mca/common/ubcl/configure.m4
new file mode 100644
index 00000000000..d98ebf43103
--- /dev/null
+++ b/opal/mca/common/ubcl/configure.m4
@@ -0,0 +1,27 @@
+# -*- shell-script -*-
+#
+# Copyright (c) 2024 Bull S.A.S. All rights reserved.
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+
+AC_DEFUN([MCA_opal_common_ubcl_CONFIG],[
+    AC_CONFIG_FILES([opal/mca/common/ubcl/Makefile])
+
+    OMPI_CHECK_UBCL([common_ubcl],
+                    [common_ubcl_happy="yes"],
+                    [common_ubcl_happy="no"])
+
+
+    AS_IF([test "$common_ubcl_happy" = "yes"],
+          [$1],
+          [$2])
+
+    # substitute in the things needed to build ubcl
+    AC_SUBST([common_ubcl_CPPFLAGS])
+    AC_SUBST([common_ubcl_LDFLAGS])
+    AC_SUBST([common_ubcl_LIBS])
+])dnl
diff --git a/opal/mca/common/ubcl/help-mpi-common-ubcl.txt b/opal/mca/common/ubcl/help-mpi-common-ubcl.txt
new file mode 100644
index 00000000000..90b1ddec53f
--- /dev/null
+++ b/opal/mca/common/ubcl/help-mpi-common-ubcl.txt
@@ -0,0 +1,28 @@
+# -*- text -*-
+#
+# Copyright (c) 2024 Bull SAS. All rights reserved.
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+# This is the English help file for Open MPI's UBCL support.
+#
+[dlopen disabled]
+Open MPI was compiled without dynamic library support (e.g., with the
+  --disable-dlopen flag), and therefore cannot utilize UBCL support.
+
+If you need UBCL support, reconfigure Open MPI with dynamic library support enabled.
+#
+[dlopen failed]
+The library attempted to open the following UBCL libraries,
+but each of them failed. UBCL cannot be used.
+%s
+If you need to use UBCL, then try setting LD_LIBRARY_PATH to the location
+of libubcl.so to get past this issue.
+#
+[No memory]
+A call to allocate memory within the UBCL support failed. This is
+an unrecoverable error and will cause the program to abort.
+  Hostname: %s
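For reference, the common/ubcl entry points introduced above are reference-counted and
meant to be shared by every component that links against the library. A minimal sketch
of the expected lifecycle from a consuming component (the my_component_* wrappers are
hypothetical; only the mca_common_ubcl_* calls come from this patch):

    #include "opal/mca/common/ubcl/common_ubcl.h"

    /* Hypothetical consumer: register MCA vars, then open/close UBCL. */
    static int my_component_register(void)
    {
        mca_common_ubcl_register_mca();   /* refcounted, safe to call from every consumer */
        return OPAL_SUCCESS;
    }

    static int my_component_open(void)
    {
        /* dlopen()s a compatible libubcl: LD_LIBRARY_PATH first, then the
         * versioned directories under the ubcl_search_path MCA variable. */
        return mca_common_ubcl_init();
    }

    static int my_component_close(void)
    {
        return mca_common_ubcl_fini();    /* the last caller closes the handle */
    }
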
Changes include: - Removed ompi/mca/coll/hcoll component directory and all source files - Removed config/ompi_check_libhcoll.m4 configuration macro - Updated coll_tags.h to remove HCOLL tag space definitions - Updated coll_base_allgather.c and coll_base_allgatherv.c to use NEIGHBOR tag space instead of HCOLL tag space - Removed hcoll references from platform configuration files - Removed hcoll documentation and configuration options - Removed hcoll references from code comments Signed-off-by: Tomislav Janjusic --- .ci/mellanox/README.md | 2 +- config/ompi_check_libhcoll.m4 | 61 -- contrib/amca-param-sets/ft-mpi | 4 +- contrib/platform/intel/bend/linux | 2 +- contrib/platform/lanl/toss/README.md | 1 - .../lanl/toss/toss2-mlx-optimized.conf | 1 - contrib/platform/mellanox/optimized | 5 - docs/features/ulfm.rst | 2 +- .../configure-cli-options/networking.rst | 9 - docs/tuning-apps/coll-tuned.rst | 2 +- docs/tuning-apps/networking/cuda.rst | 28 - ompi/mca/coll/base/coll_base_allgather.c | 4 +- ompi/mca/coll/base/coll_base_allgatherv.c | 4 +- ompi/mca/coll/base/coll_tags.h | 4 +- ompi/mca/coll/hcoll/Makefile.am | 50 - ompi/mca/coll/hcoll/coll_hcoll.h | 342 ------- ompi/mca/coll/hcoll/coll_hcoll_component.c | 251 ----- ompi/mca/coll/hcoll/coll_hcoll_debug.h | 35 - ompi/mca/coll/hcoll/coll_hcoll_dtypes.h | 281 ------ ompi/mca/coll/hcoll/coll_hcoll_module.c | 478 --------- ompi/mca/coll/hcoll/coll_hcoll_ops.c | 945 ------------------ ompi/mca/coll/hcoll/coll_hcoll_rte.c | 487 --------- ompi/mca/coll/hcoll/configure.m4 | 38 - ompi/mca/coll/hcoll/owner.txt | 7 - ompi/op/op.c | 4 - oshmem/mca/scoll/basic/scoll_basic.h | 2 +- 26 files changed, 12 insertions(+), 3037 deletions(-) delete mode 100644 config/ompi_check_libhcoll.m4 delete mode 100644 ompi/mca/coll/hcoll/Makefile.am delete mode 100644 ompi/mca/coll/hcoll/coll_hcoll.h delete mode 100644 ompi/mca/coll/hcoll/coll_hcoll_component.c delete mode 100644 ompi/mca/coll/hcoll/coll_hcoll_debug.h delete mode 100644 ompi/mca/coll/hcoll/coll_hcoll_dtypes.h delete mode 100644 ompi/mca/coll/hcoll/coll_hcoll_module.c delete mode 100644 ompi/mca/coll/hcoll/coll_hcoll_ops.c delete mode 100644 ompi/mca/coll/hcoll/coll_hcoll_rte.c delete mode 100644 ompi/mca/coll/hcoll/configure.m4 delete mode 100644 ompi/mca/coll/hcoll/owner.txt diff --git a/.ci/mellanox/README.md b/.ci/mellanox/README.md index 2a9d5c09b15..c41229021ff 100644 --- a/.ci/mellanox/README.md +++ b/.ci/mellanox/README.md @@ -6,7 +6,7 @@ CI is managed by [Azure Pipelines](https://docs.microsoft.com/en-us/azure/devops/pipelines/?view=azure-devops) service. Mellanox Open MPI CI includes: -* Open MPI building with internal stable engineering versions of UCX and HCOLL. The building is run in Docker-based environment. +* Open MPI building with internal stable engineering versions of UCX. The building is run in Docker-based environment. * Sanity functional testing. ### How to Run CI Mellanox Open MPI CI is triggered upon the following events: diff --git a/config/ompi_check_libhcoll.m4 b/config/ompi_check_libhcoll.m4 deleted file mode 100644 index 2151148ab14..00000000000 --- a/config/ompi_check_libhcoll.m4 +++ /dev/null @@ -1,61 +0,0 @@ -dnl -*- shell-script -*- -dnl -dnl Copyright (c) 2011 Mellanox Technologies. All rights reserved. -dnl Copyright (c) 2013 Cisco Systems, Inc. All rights reserved. -dnl Copyright (c) 2015 Research Organization for Information Science -dnl and Technology (RIST). All rights reserved. 
-dnl $COPYRIGHT$ -dnl -dnl Additional copyrights may follow -dnl -dnl $HEADER$ -dnl - -# OMPI_CHECK_HCOLL(prefix, [action-if-found], [action-if-not-found]) -# -------------------------------------------------------- -# check if hcoll support can be found. sets prefix_{CPPFLAGS, -# LDFLAGS, LIBS} as needed and runs action-if-found if there is -# support, otherwise executes action-if-not-found -AC_DEFUN([OMPI_CHECK_HCOLL],[ - OPAL_VAR_SCOPE_PUSH([ompi_check_hcoll_happy CPPFLAGS_save LDFLAGS_save LIBS_save]) - - AC_ARG_WITH([hcoll], - [AS_HELP_STRING([--with-hcoll(=DIR)], - [Build hcoll (Mellanox Hierarchical Collectives) support, optionally adding - DIR/include and DIR/lib or DIR/lib64 to the search path for headers and libraries])]) - - OAC_CHECK_PACKAGE([hcoll], - [$1], - [hcoll/api/hcoll_api.h], - [hcoll], - [hcoll_get_version], - [ompi_check_hcoll_happy="yes"], - [ompi_check_hcoll_happy="no"]) - - AS_IF([test "$ompi_check_hcoll_happy" = "yes"], - [CPPFLAGS_save=$CPPFLAGS - LDFLAGS_save=$LDFLAGS - LIBS_save=$LIBS - - CPPFLAGS="${$1_CPPFLAGS} ${CPPFLAGS}" - LDFLAGS="${$1_LDFLAGS} ${LDFLAGS}" - LIBS="${$1_LIBS} ${LIBS}" - - AC_CHECK_FUNCS(hcoll_context_free, [], []) - - CPPFLAGS=$CPPFLAGS_save - LDFLAGS=$LDFLAGS_save - LIBS=$LIBS_save]) - - AS_IF([test "$ompi_check_hcoll_happy" = "yes" && test "$enable_progress_threads" = "yes"], - [AC_MSG_WARN([hcoll driver does not currently support progress threads. Disabling HCOLL.]) - ompi_check_hcoll_happy="no"]) - - AS_IF([test "$ompi_check_hcoll_happy" = "yes"], - [$2], - [AS_IF([test ! -z "$with_hcoll" && test "$with_hcoll" != "no"], - [AC_MSG_ERROR([HCOLL support requested but not found. Aborting])]) - $3]) - - OPAL_VAR_SCOPE_POP -]) diff --git a/contrib/amca-param-sets/ft-mpi b/contrib/amca-param-sets/ft-mpi index 45eac4c39ee..8ff6da9c9d2 100644 --- a/contrib/amca-param-sets/ft-mpi +++ b/contrib/amca-param-sets/ft-mpi @@ -63,7 +63,7 @@ btl=^usnic # The following frameworks/components are UNTESTED, and probably won't work. # They should run without faults, and will probably crash/deadlock after a fault. # You may try at your own risk. -# coll hcoll, portals4 +# coll portals4 # topo (all) # osc (all) # io (all) @@ -72,7 +72,7 @@ btl=^usnic # We will disable only the components for which good components are known to exist. # Other untested components are selectable but will issue a runtime warning at # initiation if FT is enabled. -coll=^hcoll,portals4 +coll=^portals4 # # The following frameworks/components are NOT WORKING. Do not enable these with FT. 
diff --git a/contrib/platform/intel/bend/linux b/contrib/platform/intel/bend/linux index 10580121a58..85c540b8094 100644 --- a/contrib/platform/intel/bend/linux +++ b/contrib/platform/intel/bend/linux @@ -13,7 +13,7 @@ enable_ipv6=no enable_man_pages=no enable_mpi_fortran=no enable_memchecker=no -enable_mca_no_build=memchecker,coll-adapt,coll-cuda,coll-demo,coll-ftagree,coll-han,coll-hcoll,coll-inter,coll-libnbc,coll-monitoring,coll-portals4,coll-tuned,common-monitoring,common-ompio,fbtl,fcoll,fs,io,mtl,osc,pml-cm,pml-monitoring,pml-ucx,pml-v,sharedfp,topo,vprotocol,btl-ofi,btl-portals4,btl-smcuda,btl-uct,btl-ugni,btl-usnic,common-cuda,common-ofi,common-ucx +enable_mca_no_build=memchecker,coll-adapt,coll-cuda,coll-demo,coll-ftagree,coll-han,coll-inter,coll-libnbc,coll-monitoring,coll-portals4,coll-tuned,common-monitoring,common-ompio,fbtl,fcoll,fs,io,mtl,osc,pml-cm,pml-monitoring,pml-ucx,pml-v,sharedfp,topo,vprotocol,btl-ofi,btl-portals4,btl-smcuda,btl-uct,btl-ugni,btl-usnic,common-cuda,common-ofi,common-ucx enable_contrib_no_build=libompitrace with_memory_manager=no with_devel_headers=yes diff --git a/contrib/platform/lanl/toss/README.md b/contrib/platform/lanl/toss/README.md index d677de18ff6..0a83c5b29b4 100644 --- a/contrib/platform/lanl/toss/README.md +++ b/contrib/platform/lanl/toss/README.md @@ -43,7 +43,6 @@ created. (change S to X; make sure numbers match those for the same entry in contrib/platform/lanl/toss/optimized-mlx.conf) - addition: pml = ob1 (disable MXM) - - addition: coll = ^hcoll (disable MXM) - toss3-hfi-optimized - copy of toss2-qib-optimized - toss3-hfi-optimized.conf diff --git a/contrib/platform/lanl/toss/toss2-mlx-optimized.conf b/contrib/platform/lanl/toss/toss2-mlx-optimized.conf index b44452760d7..9148f8d51f5 100644 --- a/contrib/platform/lanl/toss/toss2-mlx-optimized.conf +++ b/contrib/platform/lanl/toss/toss2-mlx-optimized.conf @@ -106,4 +106,3 @@ ras_base_launch_orted_on_hn = true ## Disable MXM pml = ob1 -coll = ^hcoll diff --git a/contrib/platform/mellanox/optimized b/contrib/platform/mellanox/optimized index fdde7cfc15f..f75b8e09c0f 100644 --- a/contrib/platform/mellanox/optimized +++ b/contrib/platform/mellanox/optimized @@ -22,11 +22,6 @@ if [ "$mellanox_autodetect" == "yes" ]; then with_ucx=$ucx_dir fi - hcoll_dir=${hcoll_dir:="$(pkg-config --variable=prefix hcoll)"} - if [ -d $hcoll_dir ]; then - with_hcoll=$hcoll_dir - fi - slurm_dir=${slurm_dir:="/usr"} if [ -f $slurm_dir/include/slurm/slurm.h ]; then with_slurm=$slurm_dir diff --git a/docs/features/ulfm.rst b/docs/features/ulfm.rst index 86815b7e435..6cb0acdd006 100644 --- a/docs/features/ulfm.rst +++ b/docs/features/ulfm.rst @@ -333,7 +333,7 @@ correctly after a failure. * ``cuda``, ``inter``, ``sync``, ``sm``: **untested** (they have not been modified to handle faults, but we expect correct post-fault behavior) - * ``hcoll``, ``portals4`` **disabled** (they have not been modified + * ``portals4`` **disabled** (it has not been modified to handle faults, and we expect unspecified post-fault behavior) * ``osc``: MPI one-sided communications diff --git a/docs/installing-open-mpi/configure-cli-options/networking.rst b/docs/installing-open-mpi/configure-cli-options/networking.rst index cabb723a482..d5ff8108929 100644 --- a/docs/installing-open-mpi/configure-cli-options/networking.rst +++ b/docs/installing-open-mpi/configure-cli-options/networking.rst @@ -14,15 +14,6 @@ can be used with ``configure``: FCA is the support library for Mellanox switches and HCAs. 
-* ``--with-hcoll=DIR``: - Specify the directory where the Mellanox hcoll library and header - files are located. This option is generally only necessary if the - hcoll headers and libraries are not in default compiler/linker - search paths. - - hcoll is the support library for MPI collective operation offload on - Mellanox ConnectX-3 HCAs (and later). - * ``--with-knem=DIR``: Specify the directory where the knem libraries and header files are located. This option is generally only necessary if the knem headers diff --git a/docs/tuning-apps/coll-tuned.rst b/docs/tuning-apps/coll-tuned.rst index fa9c7ba7236..1d5549256d8 100644 --- a/docs/tuning-apps/coll-tuned.rst +++ b/docs/tuning-apps/coll-tuned.rst @@ -3,7 +3,7 @@ Tuning Collectives Open MPI's ``coll`` framework provides a number of components implementing collective communication, including: ``han``, ``libnbc``, ``self``, ``ucc`` ``base``, -``hcoll``, ``sync``, ``xhc``, ``accelerator``, ``basic``, ``ftagree``, ``inter``, ``portals4``, +``sync``, ``xhc``, ``accelerator``, ``basic``, ``ftagree``, ``inter``, ``portals4``, and ``tuned``. Some of these components may not be available depending on how Open MPI was compiled and what hardware is available on the system. A run-time decision based on each component's self reported priority, selects which diff --git a/docs/tuning-apps/networking/cuda.rst b/docs/tuning-apps/networking/cuda.rst index 62e85fea4fd..8a393b3f32f 100644 --- a/docs/tuning-apps/networking/cuda.rst +++ b/docs/tuning-apps/networking/cuda.rst @@ -155,7 +155,6 @@ CUDA-aware support is available in: * The OFI (``ofi``) MTL with the CM (``cm``) PML. * Both CUDA-ized shared memory (``smcuda``) and TCP (``tcp``) BTLs with the OB1 (``ob1``) PML. -* The HCOLL (``hcoll``) COLL ///////////////////////////////////////////////////////////////////////// @@ -702,30 +701,3 @@ to query rank information and utilize that to select a GPU. MPI internal CUDA resources are released during MPI_Finalize. Thus it is an application error to call cudaDeviceReset before MPI_Finalize is called. - - -///////////////////////////////////////////////////////////////////////// - -How do I enable CUDA support in HCOLL collective component ----------------------------------------------------------- - -HCOLL component supports CUDA GPU buffers for the following -collectives: - -MPI_Allreduce -MPI_Bcast -MPI_Allgather -MPI_Ibarrier -MPI_Ibcast -MPI_Iallgather -MPI_Iallreduce - -To enable CUDA GPU buffer support in these collectives pass the -following environment variables via mpirun: - -.. code-block:: - - shell$ mpirun -x HCOLL_GPU_ENABLE=1 -x HCOLL_ENABLE_NBC=1 .. - -See `nVidia HCOLL documentation `_ -for more information. diff --git a/ompi/mca/coll/base/coll_base_allgather.c b/ompi/mca/coll/base/coll_base_allgather.c index d3e27884106..6d9bd6fcfc3 100644 --- a/ompi/mca/coll/base/coll_base_allgather.c +++ b/ompi/mca/coll/base/coll_base_allgather.c @@ -291,8 +291,8 @@ int ompi_coll_base_allgather_intra_sparbit(const void *sbuf, size_t scount, /* Since each process sends several non-contiguos blocks of data, each block sent (and therefore each send and recv call) needs a different tag. 
*/ /* As base OpenMPI only provides one tag for allgather, we are forced to use a tag space from other components in the send and recv calls */ - MCA_PML_CALL(isend(tmpsend + (ptrdiff_t) send_disp * scount * rext, scount, rdtype, sendto, MCA_COLL_BASE_TAG_HCOLL_BASE - send_disp, MCA_PML_BASE_SEND_STANDARD, comm, requests + transfer_count)); - MCA_PML_CALL(irecv(tmprecv + (ptrdiff_t) recv_disp * rcount * rext, rcount, rdtype, recvfrom, MCA_COLL_BASE_TAG_HCOLL_BASE - recv_disp, comm, requests + data_expected - exclusion + transfer_count)); + MCA_PML_CALL(isend(tmpsend + (ptrdiff_t) send_disp * scount * rext, scount, rdtype, sendto, MCA_COLL_BASE_TAG_NEIGHBOR_BASE - send_disp, MCA_PML_BASE_SEND_STANDARD, comm, requests + transfer_count)); + MCA_PML_CALL(irecv(tmprecv + (ptrdiff_t) recv_disp * rcount * rext, rcount, rdtype, recvfrom, MCA_COLL_BASE_TAG_NEIGHBOR_BASE - recv_disp, comm, requests + data_expected - exclusion + transfer_count)); } ompi_request_wait_all(transfer_count * 2, requests, MPI_STATUSES_IGNORE); diff --git a/ompi/mca/coll/base/coll_base_allgatherv.c b/ompi/mca/coll/base/coll_base_allgatherv.c index 24cd84ec616..337e09f7c77 100644 --- a/ompi/mca/coll/base/coll_base_allgatherv.c +++ b/ompi/mca/coll/base/coll_base_allgatherv.c @@ -332,12 +332,12 @@ int ompi_coll_base_allgatherv_intra_sparbit(const void *sbuf, size_t scount, if(ompi_count_array_get(rcounts, send_disp) > 0) MCA_PML_CALL(isend(tmpsend + ompi_disp_array_get(rdispls, send_disp) * rext, ompi_count_array_get(rcounts, send_disp), rdtype, sendto, - MCA_COLL_BASE_TAG_HCOLL_BASE - send_disp, + MCA_COLL_BASE_TAG_NEIGHBOR_BASE - send_disp, MCA_PML_BASE_SEND_STANDARD, comm, requests + step_requests++)); if(ompi_count_array_get(rcounts, recv_disp) > 0) MCA_PML_CALL(irecv(tmprecv + ompi_disp_array_get(rdispls, recv_disp) * rext, ompi_count_array_get(rcounts, recv_disp), rdtype, recvfrom, - MCA_COLL_BASE_TAG_HCOLL_BASE - recv_disp, comm, + MCA_COLL_BASE_TAG_NEIGHBOR_BASE - recv_disp, comm, requests + step_requests++)); } ompi_request_wait_all(step_requests, requests, MPI_STATUSES_IGNORE); diff --git a/ompi/mca/coll/base/coll_tags.h b/ompi/mca/coll/base/coll_tags.h index 5dc19061b37..5d3da7eafe5 100644 --- a/ompi/mca/coll/base/coll_tags.h +++ b/ompi/mca/coll/base/coll_tags.h @@ -69,10 +69,8 @@ #define MCA_COLL_BASE_TAG_NONBLOCKING_END ((-1 * INT_MAX/2) + 1) #define MCA_COLL_BASE_TAG_NEIGHBOR_BASE (MCA_COLL_BASE_TAG_NONBLOCKING_END - 1) #define MCA_COLL_BASE_TAG_NEIGHBOR_END (MCA_COLL_BASE_TAG_NEIGHBOR_BASE - 1024) -#define MCA_COLL_BASE_TAG_HCOLL_BASE (-1 * INT_MAX/2) -#define MCA_COLL_BASE_TAG_HCOLL_END (-1 * INT_MAX) #define MCA_COLL_BASE_TAG_BASE MCA_COLL_BASE_TAG_BLOCKING_BASE -#define MCA_COLL_BASE_TAG_END MCA_COLL_BASE_TAG_HCOLL_END +#define MCA_COLL_BASE_TAG_END MCA_COLL_BASE_TAG_NEIGHBOR_END #endif /* MCA_COLL_BASE_TAGS_H */ diff --git a/ompi/mca/coll/hcoll/Makefile.am b/ompi/mca/coll/hcoll/Makefile.am deleted file mode 100644 index 37ec1c96c92..00000000000 --- a/ompi/mca/coll/hcoll/Makefile.am +++ /dev/null @@ -1,50 +0,0 @@ -# -*- shell-script -*- -# -# -# Copyright (c) 2011 Mellanox Technologies. All rights reserved. -# Copyright (c) 2015 Research Organization for Information Science -# and Technology (RIST). All rights reserved. -# Copyright (c) 2017 IBM Corporation. All rights reserved. 
-# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# -# - -AM_CPPFLAGS = $(coll_hcoll_CPPFLAGS) - -coll_hcoll_sources = \ - coll_hcoll.h \ - coll_hcoll_debug.h \ - coll_hcoll_dtypes.h \ - coll_hcoll_module.c \ - coll_hcoll_component.c \ - coll_hcoll_rte.c \ - coll_hcoll_ops.c - -# Make the output library in this directory, and name it either -# mca__.la (for DSO builds) or libmca__.la -# (for static builds). - -if MCA_BUILD_ompi_coll_hcoll_DSO -component_noinst = -component_install = mca_coll_hcoll.la -else -component_noinst = libmca_coll_hcoll.la -component_install = -endif - -mcacomponentdir = $(ompilibdir) -mcacomponent_LTLIBRARIES = $(component_install) -mca_coll_hcoll_la_SOURCES = $(coll_hcoll_sources) -mca_coll_hcoll_la_LIBADD = $(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la \ - $(coll_hcoll_LIBS) -mca_coll_hcoll_la_LDFLAGS = -module -avoid-version $(coll_hcoll_LDFLAGS) - -noinst_LTLIBRARIES = $(component_noinst) -libmca_coll_hcoll_la_SOURCES = $(coll_hcoll_sources) -libmca_coll_hcoll_la_LIBADD = $(coll_hcoll_LIBS) -libmca_coll_hcoll_la_LDFLAGS = -module -avoid-version $(coll_hcoll_LDFLAGS) - diff --git a/ompi/mca/coll/hcoll/coll_hcoll.h b/ompi/mca/coll/hcoll/coll_hcoll.h deleted file mode 100644 index 07460aeb3f8..00000000000 --- a/ompi/mca/coll/hcoll/coll_hcoll.h +++ /dev/null @@ -1,342 +0,0 @@ -/** - Copyright (c) 2011 Mellanox Technologies. All rights reserved. - Copyright (c) 2015-2019 Research Organization for Information Science - and Technology (RIST). All rights reserved. - $COPYRIGHT$ - - Additional copyrights may follow - - $HEADER$ - */ - -#ifndef MCA_COLL_FCA_H -#define MCA_COLL_FCA_H - -#include "ompi_config.h" - -#include "mpi.h" -#include "ompi/runtime/ompi_rte.h" -#include "ompi/mca/mca.h" -#include "opal/memoryhooks/memory.h" -#include "opal/mca/memory/base/base.h" -#include "ompi/mca/coll/coll.h" -#include "ompi/request/request.h" -#include "ompi/mca/pml/pml.h" -#include "ompi/mca/coll/base/coll_tags.h" -#include "ompi/communicator/communicator.h" -#include "ompi/attribute/attribute.h" -#include "ompi/op/op.h" - -#include "hcoll/api/hcoll_api.h" -#include "hcoll/api/hcoll_constants.h" - - -#include "coll_hcoll_debug.h" -#ifndef HCOLL_VERSION -#define HCOLL_VERSION(major, minor) (((major)<= HCOLL_VERSION(3,2) - /* hcoll init options */ - hcoll_init_opts_t *init_opts; -#endif - - /* FCA global stuff */ - mca_coll_hcoll_ops_t hcoll_ops; - opal_free_list_t requests; - opal_free_list_t dtypes; - int derived_types_support_enabled; -}; -typedef struct mca_coll_hcoll_component_t mca_coll_hcoll_component_t; - -OMPI_DECLSPEC extern mca_coll_hcoll_component_t mca_coll_hcoll_component; - - - - -/** - * FCA enabled communicator - */ -struct mca_coll_hcoll_module_t { - mca_coll_base_module_t super; - - ompi_communicator_t *comm; - int rank; - void *hcoll_context; - /* Saved handlers - for fallback */ - mca_coll_base_module_reduce_fn_t previous_reduce; - mca_coll_base_module_t *previous_reduce_module; - mca_coll_base_module_allreduce_fn_t previous_allreduce; - mca_coll_base_module_t *previous_allreduce_module; - mca_coll_base_module_bcast_fn_t previous_bcast; - mca_coll_base_module_t *previous_bcast_module; - mca_coll_base_module_barrier_fn_t previous_barrier; - mca_coll_base_module_t *previous_barrier_module; - mca_coll_base_module_allgather_fn_t previous_allgather; - mca_coll_base_module_t *previous_allgather_module; - mca_coll_base_module_allgatherv_fn_t previous_allgatherv; - mca_coll_base_module_t *previous_allgatherv_module; - 
mca_coll_base_module_alltoall_fn_t previous_alltoall; - mca_coll_base_module_t *previous_alltoall_module; - mca_coll_base_module_alltoallv_fn_t previous_alltoallv; - mca_coll_base_module_t *previous_alltoallv_module; - mca_coll_base_module_alltoallw_fn_t previous_alltoallw; - mca_coll_base_module_t *previous_alltoallw_module; - mca_coll_base_module_gather_fn_t previous_gather; - mca_coll_base_module_t *previous_gather_module; - mca_coll_base_module_gatherv_fn_t previous_gatherv; - mca_coll_base_module_t *previous_gatherv_module; - mca_coll_base_module_scatterv_fn_t previous_scatterv; - mca_coll_base_module_t *previous_scatterv_module; - mca_coll_base_module_reduce_scatter_fn_t previous_reduce_scatter; - mca_coll_base_module_t *previous_reduce_scatter_module; - mca_coll_base_module_reduce_scatter_block_fn_t previous_reduce_scatter_block; - mca_coll_base_module_t *previous_reduce_scatter_block_module; - mca_coll_base_module_ibcast_fn_t previous_ibcast; - mca_coll_base_module_t *previous_ibcast_module; - mca_coll_base_module_ibarrier_fn_t previous_ibarrier; - mca_coll_base_module_t *previous_ibarrier_module; - mca_coll_base_module_iallgather_fn_t previous_iallgather; - mca_coll_base_module_t *previous_iallgather_module; - mca_coll_base_module_iallgatherv_fn_t previous_iallgatherv; - mca_coll_base_module_t *previous_iallgatherv_module; - mca_coll_base_module_iallreduce_fn_t previous_iallreduce; - mca_coll_base_module_t *previous_iallreduce_module; - mca_coll_base_module_ireduce_fn_t previous_ireduce; - mca_coll_base_module_t *previous_ireduce_module; - mca_coll_base_module_igatherv_fn_t previous_igatherv; - mca_coll_base_module_t *previous_igatherv_module; - mca_coll_base_module_ialltoall_fn_t previous_ialltoall; - mca_coll_base_module_t *previous_ialltoall_module; - mca_coll_base_module_ialltoallv_fn_t previous_ialltoallv; - mca_coll_base_module_t *previous_ialltoallv_module; -}; -typedef struct mca_coll_hcoll_module_t mca_coll_hcoll_module_t; - -OBJ_CLASS_DECLARATION(mca_coll_hcoll_module_t); - - - - -/* API functions */ -int mca_coll_hcoll_init_query(bool enable_progress_threads, bool enable_mpi_threads); -mca_coll_base_module_t *mca_coll_hcoll_comm_query(struct ompi_communicator_t *comm, int *priority); -int mca_coll_hcoll_get_lib(void); -void hcoll_rte_fns_setup(void); - - -int mca_coll_hcoll_barrier(struct ompi_communicator_t *comm, - mca_coll_base_module_t *module); - -int mca_coll_hcoll_bcast(void *buff, size_t count, - struct ompi_datatype_t *datatype, int root, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module); - -int mca_coll_hcoll_allgather(const void *sbuf, size_t scount, - struct ompi_datatype_t *sdtype, - void *rbuf, size_t rcount, - struct ompi_datatype_t *rdtype, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module); - -int mca_coll_hcoll_allgatherv(const void *sbuf, int scount, - struct ompi_datatype_t *sdtype, - void *rbuf, ompi_count_array_t rcount, - ompi_disp_array_t displs, - struct ompi_datatype_t *rdtype, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module); - -int mca_coll_hcoll_gather(const void *sbuf, size_t scount, - struct ompi_datatype_t *sdtype, - void *rbuf, size_t rcount, - struct ompi_datatype_t *rdtype, - int root, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module); - -int mca_coll_hcoll_allreduce(const void *sbuf, void *rbuf, size_t count, - struct ompi_datatype_t *dtype, - struct ompi_op_t *op, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module); - -#if HCOLL_API > 
HCOLL_VERSION(4,5) -int mca_coll_hcoll_reduce_scatter_block(const void *sbuf, void *rbuf, size_t rcount, - struct ompi_datatype_t *dtype, - struct ompi_op_t *op, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module); -int mca_coll_hcoll_reduce_scatter(const void *sbuf, void *rbuf, ompi_count_array_t rcounts, - struct ompi_datatype_t *dtype, - struct ompi_op_t *op, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module); -#endif -int mca_coll_hcoll_reduce(const void *sbuf, void *rbuf, size_t count, - struct ompi_datatype_t *dtype, - struct ompi_op_t *op, - int root, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module); - -int mca_coll_hcoll_alltoall(const void *sbuf, size_t scount, - struct ompi_datatype_t *sdtype, - void* rbuf, size_t rcount, - struct ompi_datatype_t *rdtype, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module); - -int mca_coll_hcoll_alltoallv(const void *sbuf, ompi_count_array_t scounts, - ompi_disp_array_t sdisps, - struct ompi_datatype_t *sdtype, - void *rbuf, ompi_count_array_t rcounts, - ompi_disp_array_t rdisps, - struct ompi_datatype_t *rdtype, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module); - -int mca_coll_hcoll_gatherv(const void* sbuf, size_t scount, - struct ompi_datatype_t *sdtype, - void* rbuf, ompi_count_array_t rcounts, ompi_disp_array_t displs, - struct ompi_datatype_t *rdtype, - int root, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module); - - -int mca_coll_hcoll_scatterv(const void* sbuf, ompi_count_array_t scounts, ompi_disp_array_t displs, - struct ompi_datatype_t *sdtype, - void* rbuf, size_t rcount, - struct ompi_datatype_t *rdtype, - int root, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module); - -int mca_coll_hcoll_ibarrier(struct ompi_communicator_t *comm, - ompi_request_t** request, - mca_coll_base_module_t *module); - -int mca_coll_hcoll_ibcast(void *buff, size_t count, - struct ompi_datatype_t *datatype, int root, - struct ompi_communicator_t *comm, - ompi_request_t** request, - mca_coll_base_module_t *module); - -int mca_coll_hcoll_iallgather(const void *sbuf, size_t scount, - struct ompi_datatype_t *sdtype, - void *rbuf, size_t rcount, - struct ompi_datatype_t *rdtype, - struct ompi_communicator_t *comm, - ompi_request_t** request, - mca_coll_base_module_t *module); - -int mca_coll_hcoll_iallgatherv(const void *sbuf, size_t scount, - struct ompi_datatype_t *sdtype, - void *rbuf, ompi_count_array_t rcount, - ompi_disp_array_t displs, - struct ompi_datatype_t *rdtype, - struct ompi_communicator_t *comm, - ompi_request_t** request, - mca_coll_base_module_t *module); - -int mca_coll_hcoll_iallreduce(const void *sbuf, void *rbuf, size_t count, - struct ompi_datatype_t *dtype, - struct ompi_op_t *op, - struct ompi_communicator_t *comm, - ompi_request_t** request, - mca_coll_base_module_t *module); - -int mca_coll_hcoll_ireduce(const void *sbuf, void *rbuf, size_t count, - struct ompi_datatype_t *dtype, - struct ompi_op_t *op, - int root, - struct ompi_communicator_t *comm, - ompi_request_t** request, - mca_coll_base_module_t *module); - -int mca_coll_hcoll_ialltoall(const void *sbuf, size_t scount, - struct ompi_datatype_t *sdtype, - void* rbuf, size_t rcount, - struct ompi_datatype_t *rdtype, - struct ompi_communicator_t *comm, - ompi_request_t **req, - mca_coll_base_module_t *module); - -#if HCOLL_API >= HCOLL_VERSION(3,7) -int mca_coll_hcoll_ialltoallv(const void *sbuf, ompi_count_array_t scounts, - ompi_disp_array_t 
sdisps, - struct ompi_datatype_t *sdtype, - void *rbuf, ompi_count_array_t rcounts, - ompi_disp_array_t rdisps, - struct ompi_datatype_t *rdtype, - struct ompi_communicator_t *comm, - ompi_request_t **req, - mca_coll_base_module_t *module); -#endif - -int mca_coll_hcoll_igatherv(const void* sbuf, size_t scount, - struct ompi_datatype_t *sdtype, - void* rbuf, ompi_count_array_t rcounts, ompi_disp_array_t displs, - struct ompi_datatype_t *rdtype, - int root, - struct ompi_communicator_t *comm, - ompi_request_t ** request, - mca_coll_base_module_t *module); - -void mca_coll_hcoll_mem_release_cb(void *buf, size_t length, void *cbdata, bool from_alloc); -END_C_DECLS - -#endif diff --git a/ompi/mca/coll/hcoll/coll_hcoll_component.c b/ompi/mca/coll/hcoll/coll_hcoll_component.c deleted file mode 100644 index 2c276cf79a4..00000000000 --- a/ompi/mca/coll/hcoll/coll_hcoll_component.c +++ /dev/null @@ -1,251 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2011 Mellanox Technologies. All rights reserved. - * Copyright (c) 2015 Los Alamos National Security, LLC. All rights - * reserved. - * Copyright (c) 2024 NVIDIA CORPORATION. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -#include "ompi_config.h" -#include - -#include -#include - -#include "coll_hcoll.h" -#include "opal/mca/installdirs/installdirs.h" -#include "coll_hcoll_dtypes.h" - -/* - * Public string showing the coll ompi_hcol component version number - */ -const char *mca_coll_hcoll_component_version_string = - "Open MPI HCOL collective MCA component version " OMPI_VERSION; - - -static int hcoll_open(void); -static int hcoll_close(void); -static int hcoll_register(void); -int mca_coll_hcoll_output = -1; -mca_coll_hcoll_component_t mca_coll_hcoll_component = { - - /* First, the mca_component_t struct containing meta information - about the component itfca */ - { - .collm_version = { - MCA_COLL_BASE_VERSION_3_0_0, - - /* Component name and version */ - .mca_component_name = "hcoll", - MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION, - OMPI_RELEASE_VERSION), - - /* Component open and close functions */ - .mca_open_component = hcoll_open, - .mca_close_component = hcoll_close, - .mca_register_component_params = hcoll_register, - }, - .collm_data = { - /* The component is not checkpoint ready */ - MCA_BASE_METADATA_PARAM_NONE - }, - - /* Initialization / querying functions */ - - .collm_init_query = mca_coll_hcoll_init_query, - .collm_comm_query = mca_coll_hcoll_comm_query, - }, - 90, /* priority */ - 0, /* verbose level */ - 0, /* hcoll_enable */ - NULL /*hcoll version */ -}; -MCA_BASE_COMPONENT_INIT(ompi, coll, hcoll) - - - - -int mca_coll_hcoll_get_lib(void) -{ - - memset(&mca_coll_hcoll_component.hcoll_ops, - 0, sizeof(mca_coll_hcoll_component.hcoll_ops)); - - return OMPI_SUCCESS; -} - -/* - * * Local flags - * */ -enum { - REGINT_NEG_ONE_OK = 0x01, - REGINT_GE_ZERO = 0x02, - REGINT_GE_ONE = 0x04, - REGINT_NONZERO = 0x08, - REGINT_MAX = 0x88 -}; - -enum { - REGSTR_EMPTY_OK = 0x01, - REGSTR_MAX = 0x88 -}; - - -/* - * Utility routine for integer parameter registration - */ -static int reg_int(const char* param_name, - const char* deprecated_param_name, - const char* param_desc, - int default_value, int *storage, int flags) -{ - int index; - - *storage = default_value; - index = mca_base_component_var_register( - &mca_coll_hcoll_component.super.collm_version, - param_name, param_desc, MCA_BASE_VAR_TYPE_INT, - NULL, 0, 
MCA_BASE_VAR_FLAG_SETTABLE,OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_ALL, storage); - if (NULL != deprecated_param_name) { - (void) mca_base_var_register_synonym(index, - "ompi", "coll", "hcoll", deprecated_param_name, - MCA_BASE_VAR_SYN_FLAG_DEPRECATED); - } - - if (0 != (flags & REGINT_NEG_ONE_OK) && -1 == *storage) { - return OMPI_SUCCESS; - } - - if ((0 != (flags & REGINT_GE_ZERO) && *storage < 0) || - (0 != (flags & REGINT_GE_ONE) && *storage < 1) || - (0 != (flags & REGINT_NONZERO) && 0 == *storage)) { - opal_output(0, "Bad parameter value for parameter \"%s\"", - param_name); - return OMPI_ERR_BAD_PARAM; - } - - return OMPI_SUCCESS; -} - - -static int hcoll_register(void) -{ - - int ret, tmp; - - ret = OMPI_SUCCESS; - -#define CHECK(expr) do { \ - tmp = (expr); \ - if (OMPI_SUCCESS != tmp) ret = tmp; \ - } while (0) - - - CHECK(reg_int("priority",NULL, - "Priority of the hcol coll component", - 90, - &mca_coll_hcoll_component.hcoll_priority, - 0)); - - CHECK(reg_int("verbose", NULL, - "Verbose level of the hcol coll component", - 0, - &mca_coll_hcoll_component.hcoll_verbose, - 0)); - - CHECK(reg_int("enable",NULL, - "[1|0|] Enable/Disable HCOL", - 1, - &mca_coll_hcoll_component.hcoll_enable, - 0)); - - CHECK(reg_int("np",NULL, - "Minimal number of processes in the communicator" - " for the corresponding hcoll context to be created (default: 32)", - 2, - &mca_coll_hcoll_component.hcoll_np, - 0)); - - CHECK(reg_int("datatype_fallback",NULL, - "[1|0|] Enable/Disable user defined datatypes fallback", - 1, - &mca_coll_hcoll_component.hcoll_datatype_fallback, - 0)); -#if HCOLL_API >= HCOLL_VERSION(3,6) - CHECK(reg_int("dts",NULL, - "[1|0|] Enable/Disable derived types support", - 1, - &mca_coll_hcoll_component.derived_types_support_enabled, - 0)); -#else - mca_coll_hcoll_component.derived_types_support_enabled = 0; -#endif - mca_coll_hcoll_component.compiletime_version = HCOLL_VERNO_STRING; - mca_base_component_var_register(&mca_coll_hcoll_component.super.collm_version, - MCA_COMPILETIME_VER, - "Version of the libhcoll library with which Open MPI was compiled", - MCA_BASE_VAR_TYPE_VERSION_STRING, - NULL, 0, 0, - OPAL_INFO_LVL_3, - MCA_BASE_VAR_SCOPE_READONLY, - &mca_coll_hcoll_component.compiletime_version); - mca_coll_hcoll_component.runtime_version = hcoll_get_version(); - mca_base_component_var_register(&mca_coll_hcoll_component.super.collm_version, - MCA_RUNTIME_VER, - "Version of the libhcoll library with which Open MPI is running", - MCA_BASE_VAR_TYPE_VERSION_STRING, - NULL, 0, 0, - OPAL_INFO_LVL_3, - MCA_BASE_VAR_SCOPE_READONLY, - &mca_coll_hcoll_component.runtime_version); - - return ret; -} - -static int hcoll_open(void) -{ - mca_coll_hcoll_component_t *cm; - cm = &mca_coll_hcoll_component; - mca_coll_hcoll_output = opal_output_open(NULL); - opal_output_set_verbosity(mca_coll_hcoll_output, cm->hcoll_verbose); - hcoll_rte_fns_setup(); - cm->libhcoll_initialized = false; - return OMPI_SUCCESS; -} - -static int hcoll_close(void) -{ - int rc; - mca_coll_hcoll_component_t *cm; - cm = &mca_coll_hcoll_component; - - if (false == cm->libhcoll_initialized) { - return OMPI_SUCCESS; - } - - if (cm->using_mem_hooks) { - opal_mem_hooks_unregister_release(mca_coll_hcoll_mem_release_cb); - } - -#if HCOLL_API >= HCOLL_VERSION(3,2) - hcoll_free_init_opts(cm->init_opts); -#endif - - HCOL_VERBOSE(5,"HCOLL FINALIZE"); - rc = hcoll_finalize(); - OBJ_DESTRUCT(&cm->dtypes); - opal_progress_unregister(hcoll_progress_fn); - if (HCOLL_SUCCESS != rc){ - HCOL_VERBOSE(1,"Hcol library finalize failed"); - return 
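/*
 * A minimal usage sketch for the reg_int()/CHECK() pattern defined above,
 * with a hypothetical "timeout" parameter; everything else reuses names
 * from this file. Note the storage must outlive the registration call,
 * hence the file-scope variable.
 */
static int example_timeout;  /* parameter storage registered with the var system */

static int example_register(void)
{
    int ret = OMPI_SUCCESS, tmp;
#define CHECK(expr) do { tmp = (expr); if (OMPI_SUCCESS != tmp) ret = tmp; } while (0)
    /* register an integer MCA parameter, default 30, rejecting negatives */
    CHECK(reg_int("timeout", NULL,
                  "Hypothetical timeout in seconds (>= 0)",
                  30, &example_timeout, REGINT_GE_ZERO));
#undef CHECK
    return ret;
}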
OMPI_ERROR; - } - - mca_base_framework_close(&opal_memory_base_framework); - - return OMPI_SUCCESS; -} diff --git a/ompi/mca/coll/hcoll/coll_hcoll_debug.h b/ompi/mca/coll/hcoll/coll_hcoll_debug.h deleted file mode 100644 index 3897399595b..00000000000 --- a/ompi/mca/coll/hcoll/coll_hcoll_debug.h +++ /dev/null @@ -1,35 +0,0 @@ -/** - Copyright (c) 2011 Mellanox Technologies. All rights reserved. - $COPYRIGHT$ - - Additional copyrights may follow - - $HEADER$ - */ - -#ifndef COLL_HCOL_DEBUG_H -#define COLL_HCOL_DEBUG_H -#include "ompi_config.h" -#pragma GCC system_header - -#ifdef __BASE_FILE__ -#define __HCOL_FILE__ __BASE_FILE__ -#else -#define __HCOL_FILE__ __FILE__ -#endif - -#define HCOL_VERBOSE(level, format, ...) \ - opal_output_verbose(level, mca_coll_hcoll_output, "%s:%d - %s() " format, \ - __HCOL_FILE__, __LINE__, __FUNCTION__, ## __VA_ARGS__) - -#define HCOL_ERROR(format, ... ) \ - opal_output_verbose(0, mca_coll_hcoll_output, "Error: %s:%d - %s() " format, \ - __HCOL_FILE__, __LINE__, __FUNCTION__, ## __VA_ARGS__) - - -#define HCOL_MODULE_VERBOSE(hcoll_module, level, format, ...) \ - HCOL_VERBOSE(level, "[%p:%d] " format, (void*)(hcoll_module)->comm, (hcoll_module)->rank, ## __VA_ARGS__) - -extern int mca_coll_hcoll_output; - -#endif // COLL_HCOL_DEBUG_H diff --git a/ompi/mca/coll/hcoll/coll_hcoll_dtypes.h b/ompi/mca/coll/hcoll/coll_hcoll_dtypes.h deleted file mode 100644 index 481ff855e58..00000000000 --- a/ompi/mca/coll/hcoll/coll_hcoll_dtypes.h +++ /dev/null @@ -1,281 +0,0 @@ -#ifndef COLL_HCOLL_DTYPES_H -#define COLL_HCOLL_DTYPES_H - -/*Here comes the translation between ompi_datatype_t and dte_data_representation - This is not complete and takes into account the basic datatypes only - It is used to extract allreduce bcol functions where the arrhythmetics has to be done*/ - -#include "ompi/datatype/ompi_datatype.h" -#include "ompi/datatype/ompi_datatype_internal.h" -#include "ompi/mca/op/op.h" -#include "hcoll/api/hcoll_dte.h" -extern int hcoll_type_attr_keyval; -extern mca_coll_hcoll_dtype_t zero_dte_mapping; -/*to keep this at hand: Ids of the basic opal_datatypes: -#define OPAL_DATATYPE_INT1 4 -#define OPAL_DATATYPE_INT2 5 -#define OPAL_DATATYPE_INT4 6 -#define OPAL_DATATYPE_INT8 7 -#define OPAL_DATATYPE_INT16 8 -#define OPAL_DATATYPE_UINT1 9 -#define OPAL_DATATYPE_UINT2 10 -#define OPAL_DATATYPE_UINT4 11 -#define OPAL_DATATYPE_UINT8 12 -#define OPAL_DATATYPE_UINT16 13 -#define OPAL_DATATYPE_FLOAT2 14 -#define OPAL_DATATYPE_FLOAT4 15 -#define OPAL_DATATYPE_FLOAT8 16 -#define OPAL_DATATYPE_FLOAT12 17 -#define OPAL_DATATYPE_FLOAT16 18 -#define OPAL_DATATYPE_SHORT_FLOAT_COMPLEX 19 -#define OPAL_DATATYPE_FLOAT_COMPLEX 20 -#define OPAL_DATATYPE_DOUBLE_COMPLEX 21 - -total 15 types -*/ - -static dte_data_representation_t* ompi_datatype_2_dte_data_rep[OPAL_DATATYPE_MAX_PREDEFINED] = { - &DTE_ZERO, /*OPAL_DATATYPE_LOOP 0 */ - &DTE_ZERO, /*OPAL_DATATYPE_END_LOOP 1 */ -#if defined(DTE_LB) - &DTE_LB, /*OPAL_DATATYPE_LB 2 */ -#else - &DTE_ZERO, -#endif -#if defined(DTE_UB) - &DTE_UB, /*OPAL_DATATYPE_UB 3 */ -#else - &DTE_ZERO, -#endif - &DTE_BYTE, /*OPAL_DATATYPE_INT1 4 */ - &DTE_INT16, /*OPAL_DATATYPE_INT2 5 */ - &DTE_INT32, /*OPAL_DATATYPE_INT4 6 */ - &DTE_INT64, /*OPAL_DATATYPE_INT8 7 */ - &DTE_INT128, /*OPAL_DATATYPE_INT16 8 */ - &DTE_UBYTE, /*OPAL_DATATYPE_UINT1 9 */ - &DTE_UINT16, /*OPAL_DATATYPE_UINT2 10 */ - &DTE_UINT32, /*OPAL_DATATYPE_UINT4 11 */ - &DTE_UINT64, /*OPAL_DATATYPE_UINT8 12 */ - &DTE_UINT128, /*OPAL_DATATYPE_UINT16 13 */ -#if defined(DTE_FLOAT16) - &DTE_FLOAT16, 
/*OPAL_DATATYPE_FLOAT2 14 */ -#else - &DTE_ZERO, -#endif - &DTE_FLOAT32, /*OPAL_DATATYPE_FLOAT4 15 */ - &DTE_FLOAT64, /*OPAL_DATATYPE_FLOAT8 16 */ - &DTE_FLOAT96, /*OPAL_DATATYPE_FLOAT12 17 */ - &DTE_FLOAT128, /*OPAL_DATATYPE_FLOAT16 18 */ - &DTE_ZERO, /*OPAL_DATATYPE_SHORT_FLOAT_COMPLEX 19 */ -#if defined(DTE_FLOAT32_COMPLEX) - &DTE_FLOAT32_COMPLEX, /*OPAL_DATATYPE_FLOAT_COMPLEX 20 */ -#else - &DTE_ZERO, -#endif -#if defined(DTE_FLOAT64_COMPLEX) - &DTE_FLOAT64_COMPLEX, /*OPAL_DATATYPE_DOUBLE_COMPLEX 21 */ -#else - &DTE_ZERO, -#endif -#if defined(DTE_FLOAT128_COMPLEX) - &DTE_FLOAT128_COMPLEX, /*OPAL_DATATYPE_LONG_DOUBLE_COMPLEX 22 */ -#else - &DTE_ZERO, -#endif -#if defined(DTE_BOOL) - &DTE_BOOL, /*OPAL_DATATYPE_BOOL 23 */ -#else - &DTE_ZERO, -#endif -#if defined(DTE_WCHAR) - &DTE_WCHAR, /*OPAL_DATATYPE_WCHAR 24 */ -#else - &DTE_ZERO, -#endif -#if SIZEOF_LONG == 4 - &DTE_INT32, -#else - &DTE_INT64, /*OPAL_DATATYPE_LONG 25 */ -#endif -#if SIZEOF_UNSIGNED_LONG == 4 - &DTE_UINT32, -#else - &DTE_UINT64, /*OPAL_DATATYPE_UNSIGNED_LONG 26 */ -#endif - &DTE_ZERO /*OPAL_DATATYPE_UNAVAILABLE 27 */ -}; - -enum { - TRY_FIND_DERIVED, - NO_DERIVED -}; - - -#if HCOLL_API >= HCOLL_VERSION(3,6) -static inline -void hcoll_map_derived_type(ompi_datatype_t *dtype, dte_data_representation_t *new_dte) -{ - int rc; - if (NULL == dtype->args) { - /* predefined type, shouldn't call this */ - return; - } - rc = hcoll_create_mpi_type((void*)dtype, new_dte); - if (rc != HCOLL_SUCCESS) { - /* If hcoll fails to create mpi derived type let's set zero_dte on this dtype. - This will save cycles on subsequent collective calls with the same derived - type since we will not try to create hcoll type again. */ - ompi_attr_set_c(TYPE_ATTR, (void*)dtype, &(dtype->d_keyhash), - hcoll_type_attr_keyval, &zero_dte_mapping, false); - } -} - -static dte_data_representation_t find_derived_mapping(ompi_datatype_t *dtype){ - dte_data_representation_t dte = DTE_ZERO; - mca_coll_hcoll_dtype_t *hcoll_dtype; - if (mca_coll_hcoll_component.derived_types_support_enabled) { - int map_found = 0; - ompi_attr_get_c(dtype->d_keyhash, hcoll_type_attr_keyval, - (void**)&hcoll_dtype, &map_found); - if (!map_found) - hcoll_map_derived_type(dtype, &dte); - else - dte = hcoll_dtype->type; - } - - return dte; -} - - - -static inline dte_data_representation_t -ompi_predefined_derived_2_hcoll(int ompi_id) { - switch(ompi_id) { - case OMPI_DATATYPE_MPI_FLOAT_INT: - return DTE_FLOAT_INT; - case OMPI_DATATYPE_MPI_DOUBLE_INT: - return DTE_DOUBLE_INT; - case OMPI_DATATYPE_MPI_LONG_INT: - return DTE_LONG_INT; - case OMPI_DATATYPE_MPI_SHORT_INT: - return DTE_SHORT_INT; - case OMPI_DATATYPE_MPI_LONG_DOUBLE_INT: - return DTE_LONG_DOUBLE_INT; - case OMPI_DATATYPE_MPI_2INT: - return DTE_2INT; -#if HCOLL_API >= HCOLL_VERSION(3,7) - case OMPI_DATATYPE_MPI_2INTEGER: -#if OMPI_SIZEOF_FORTRAN_INTEGER == 4 - return DTE_2INT; -#elif OMPI_SIZEOF_FORTRAN_INTEGER == 8 - return DTE_2INT64; -#else - return DTE_ZERO; -#endif - case OMPI_DATATYPE_MPI_2REAL: -#if OMPI_SIZEOF_FORTRAN_REAL == 4 - return DTE_2FLOAT32; -#elif OMPI_SIZEOF_FORTRAN_REAL == 8 - return DTE_2FLOAT64; -#else - return DTE_ZERO; -#endif - case OMPI_DATATYPE_MPI_2DBLPREC: -#if OMPI_SIZEOF_FORTRAN_DOUBLE_PRECISION == 4 - return DTE_2FLOAT32; -#elif OMPI_SIZEOF_FORTRAN_DOUBLE_PRECISION == 8 - return DTE_2FLOAT64; -#else - return DTE_ZERO; -#endif -#endif - default: - break; - } - return DTE_ZERO; -} -#endif - -static dte_data_representation_t -ompi_dtype_2_hcoll_dtype( ompi_datatype_t *dtype, - const int mode) -{ - 
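/*
 * A condensed sketch of the predefined-type lookup that
 * ompi_dtype_2_hcoll_dtype() performs against the table above; the name
 * lookup_predefined is hypothetical, while the bounds and flag checks
 * mirror the real code.
 */
static inline dte_data_representation_t
lookup_predefined(const ompi_datatype_t *dtype)
{
    int opal_id = dtype->super.id;  /* index into the OPAL basic-type ids */
    if ((dtype->super.flags & OMPI_DATATYPE_FLAG_PREDEFINED) &&
        opal_id > 0 && opal_id < OPAL_DATATYPE_MAX_PREDEFINED) {
        return *ompi_datatype_2_dte_data_rep[opal_id];
    }
    /* anything else: caller tries the derived mapping or falls back */
    return DTE_ZERO;
}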
int ompi_type_id = dtype->id; - int opal_type_id = dtype->super.id; - dte_data_representation_t dte_data_rep = DTE_ZERO; - - if (ompi_type_id < OMPI_DATATYPE_MPI_MAX_PREDEFINED && - dtype->super.flags & OMPI_DATATYPE_FLAG_PREDEFINED) { - if (opal_type_id > 0 && opal_type_id < OPAL_DATATYPE_MAX_PREDEFINED) { - dte_data_rep = *ompi_datatype_2_dte_data_rep[opal_type_id]; - } -#if HCOLL_API >= HCOLL_VERSION(3,6) - else if (TRY_FIND_DERIVED == mode){ - dte_data_rep = ompi_predefined_derived_2_hcoll(ompi_type_id); - } - } else { - if (TRY_FIND_DERIVED == mode) - dte_data_rep = find_derived_mapping(dtype); -#endif - } - if (HCOL_DTE_IS_ZERO(dte_data_rep) && TRY_FIND_DERIVED == mode && - !mca_coll_hcoll_component.hcoll_datatype_fallback) { - dte_data_rep = DTE_ZERO; - dte_data_rep.rep.in_line_rep.data_handle.in_line.in_line = 0; - dte_data_rep.rep.in_line_rep.data_handle.pointer_to_handle = (uint64_t ) &dtype->super; - } - return dte_data_rep; -} - -static hcoll_dte_op_t* ompi_op_2_hcoll_op[OMPI_OP_BASE_FORTRAN_OP_MAX + 1] = { - &hcoll_dte_op_null, /* OMPI_OP_BASE_FORTRAN_NULL = 0 */ - &hcoll_dte_op_max, /* OMPI_OP_BASE_FORTRAN_MAX */ - &hcoll_dte_op_min, /* OMPI_OP_BASE_FORTRAN_MIN */ - &hcoll_dte_op_sum, /* OMPI_OP_BASE_FORTRAN_SUM */ - &hcoll_dte_op_prod, /* OMPI_OP_BASE_FORTRAN_PROD */ - &hcoll_dte_op_land, /* OMPI_OP_BASE_FORTRAN_LAND */ - &hcoll_dte_op_band, /* OMPI_OP_BASE_FORTRAN_BAND */ - &hcoll_dte_op_lor, /* OMPI_OP_BASE_FORTRAN_LOR */ - &hcoll_dte_op_bor, /* OMPI_OP_BASE_FORTRAN_BOR */ - &hcoll_dte_op_lxor, /* OMPI_OP_BASE_FORTRAN_LXOR */ - &hcoll_dte_op_bxor, /* OMPI_OP_BASE_FORTRAN_BXOR */ - &hcoll_dte_op_null, /* OMPI_OP_BASE_FORTRAN_MAXLOC */ - &hcoll_dte_op_null, /* OMPI_OP_BASE_FORTRAN_MINLOC */ - &hcoll_dte_op_null, /* OMPI_OP_BASE_FORTRAN_REPLACE */ - &hcoll_dte_op_null, /* OMPI_OP_BASE_FORTRAN_NO_OP */ - &hcoll_dte_op_null /* OMPI_OP_BASE_FORTRAN_OP_MAX */ -}; -static hcoll_dte_op_t* ompi_op_2_hcolrte_op(ompi_op_t *op) { - if (op->o_f_to_c_index > OMPI_OP_BASE_FORTRAN_OP_MAX) { - return ompi_op_2_hcoll_op[0]; /* return null */ - } - return ompi_op_2_hcoll_op[op->o_f_to_c_index]; -} - - -#if HCOLL_API >= HCOLL_VERSION(3,6) -static int hcoll_type_attr_del_fn(MPI_Datatype type, int keyval, void *attr_val, void *extra) { - int ret = OMPI_SUCCESS; - mca_coll_hcoll_dtype_t *dtype = - (mca_coll_hcoll_dtype_t*) attr_val; - - assert(dtype); - if (&zero_dte_mapping == dtype) { - return OMPI_SUCCESS; - } - if (HCOLL_SUCCESS != (ret = hcoll_dt_destroy(dtype->type))) { - HCOL_ERROR("failed to delete type attr: hcoll_dte_destroy returned %d",ret); - return OMPI_ERROR; - } - opal_free_list_return(&mca_coll_hcoll_component.dtypes, - &dtype->super); - - return OMPI_SUCCESS; -} -#else -static int hcoll_type_attr_del_fn(MPI_Datatype type, int keyval, void *attr_val, void *extra) { - /*Do nothing - it's an old version of hcoll w/o dtypes support */ - return OMPI_SUCCESS; -} -#endif -#endif /* COLL_HCOLL_DTYPES_H */ diff --git a/ompi/mca/coll/hcoll/coll_hcoll_module.c b/ompi/mca/coll/hcoll/coll_hcoll_module.c deleted file mode 100644 index 5ca588a8154..00000000000 --- a/ompi/mca/coll/hcoll/coll_hcoll_module.c +++ /dev/null @@ -1,478 +0,0 @@ -/** - * Copyright (c) 2011 Mellanox Technologies. All rights reserved. - * Copyright (c) 2016-2022 IBM Corporation. All rights reserved. - * Copyright (c) 2017 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2018 Cisco Systems, Inc. 
All rights reserved - * Copyright (c) 2022 Amazon.com, Inc. or its affiliates. - * All Rights reserved. - * Copyright (c) 2024 NVIDIA Corporation. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" -#include "coll_hcoll.h" -#include "coll_hcoll_dtypes.h" - -int hcoll_comm_attr_keyval; -int hcoll_type_attr_keyval; -mca_coll_hcoll_dtype_t zero_dte_mapping; -/* - * Initial query function that is invoked during MPI_INIT, allowing - * this module to indicate what level of thread support it provides. - */ -int mca_coll_hcoll_init_query(bool enable_progress_threads, bool enable_mpi_threads) -{ -#if HCOLL_API < HCOLL_VERSION(3,2) - if (enable_mpi_threads) { - HCOL_VERBOSE(1, "MPI_THREAD_MULTIPLE not supported; skipping hcoll component"); - return OMPI_ERROR; - } -#endif - return OMPI_SUCCESS; -} - -static void mca_coll_hcoll_module_clear(mca_coll_hcoll_module_t *hcoll_module) -{ - hcoll_module->hcoll_context = NULL; - hcoll_module->previous_barrier = NULL; - hcoll_module->previous_bcast = NULL; - hcoll_module->previous_reduce = NULL; - hcoll_module->previous_allreduce = NULL; - hcoll_module->previous_allgather = NULL; - hcoll_module->previous_allgatherv = NULL; - hcoll_module->previous_gather = NULL; - hcoll_module->previous_gatherv = NULL; - hcoll_module->previous_scatterv = NULL; - hcoll_module->previous_alltoall = NULL; - hcoll_module->previous_alltoallv = NULL; - hcoll_module->previous_alltoallw = NULL; - hcoll_module->previous_reduce = NULL; - hcoll_module->previous_reduce_scatter = NULL; - hcoll_module->previous_reduce_scatter_block = NULL; - hcoll_module->previous_ibarrier = NULL; - hcoll_module->previous_ibcast = NULL; - hcoll_module->previous_iallreduce = NULL; - hcoll_module->previous_iallgather = NULL; - hcoll_module->previous_iallgatherv = NULL; - hcoll_module->previous_igatherv = NULL; - hcoll_module->previous_ireduce = NULL; - hcoll_module->previous_ialltoall = NULL; - hcoll_module->previous_ialltoallv = NULL; - - hcoll_module->previous_barrier_module = NULL; - hcoll_module->previous_bcast_module = NULL; - hcoll_module->previous_allreduce_module = NULL; - hcoll_module->previous_reduce_module = NULL; - hcoll_module->previous_allgather_module = NULL; - hcoll_module->previous_allgatherv_module = NULL; - hcoll_module->previous_gather_module = NULL; - hcoll_module->previous_gatherv_module = NULL; - hcoll_module->previous_scatterv_module = NULL; - hcoll_module->previous_alltoall_module = NULL; - hcoll_module->previous_alltoallv_module = NULL; - hcoll_module->previous_alltoallw_module = NULL; - hcoll_module->previous_reduce_scatter_module = NULL; - hcoll_module->previous_ibarrier_module = NULL; - hcoll_module->previous_ibcast_module = NULL; - hcoll_module->previous_iallreduce_module = NULL; - hcoll_module->previous_ireduce_module = NULL; - hcoll_module->previous_iallgather_module = NULL; - hcoll_module->previous_iallgatherv_module = NULL; - hcoll_module->previous_igatherv_module = NULL; - hcoll_module->previous_ialltoall_module = NULL; - hcoll_module->previous_ialltoallv_module = NULL; -} - -static void mca_coll_hcoll_module_construct(mca_coll_hcoll_module_t *hcoll_module) -{ - mca_coll_hcoll_module_clear(hcoll_module); -} - -void mca_coll_hcoll_mem_release_cb(void *buf, size_t length, - void *cbdata, bool from_alloc) -{ - hcoll_mem_unmap(buf, length, cbdata, from_alloc); -} - -static void mca_coll_hcoll_module_destruct(mca_coll_hcoll_module_t *hcoll_module) -{ - int context_destroyed; - - if (hcoll_module->comm == 
&ompi_mpi_comm_world.comm){ - if (OMPI_SUCCESS != ompi_attr_free_keyval(COMM_ATTR, &hcoll_comm_attr_keyval, 0)) { - HCOL_VERBOSE(1,"hcoll ompi_attr_free_keyval failed"); - } - } - - /* If the hcoll_context is null then we are destroying the hcoll_module - that didn't initialized fallback colls/modules. - Then just clear and return. Otherwise release module pointers and - destroy hcoll context*/ - - if (hcoll_module->hcoll_context != NULL){ - -#if !defined(HAVE_HCOLL_CONTEXT_FREE) - context_destroyed = 0; - hcoll_destroy_context(hcoll_module->hcoll_context, - (rte_grp_handle_t)hcoll_module->comm, - &context_destroyed); -#endif - } - mca_coll_hcoll_module_clear(hcoll_module); -} - -#define HCOL_INSTALL_COLL_API(__comm, __module, __api) \ - do \ - { \ - if (NULL != __module->super.coll_##__api) \ - { \ - if (comm->c_coll->coll_##__api && !comm->c_coll->coll_##__api##_module) \ - { \ - /* save the current selected collective */ \ - MCA_COLL_SAVE_API(__comm, __api, hcoll_module->previous_##__api, hcoll_module->previous_##__api##_module, "hcoll"); \ - /* install our own */ \ - MCA_COLL_INSTALL_API(__comm, __api, __module->super.coll_##__api, &__module->super, "hcoll"); \ - } \ - } \ - } while (0) - -#define HCOL_UNINSTALL_COLL_API(__comm, __module, __api) \ - do \ - { \ - if (&__module->super == comm->c_coll->coll_##__api##_module) \ - { \ - MCA_COLL_INSTALL_API(__comm, __api, __module->previous_##__api, __module->previous_##__api##_module, "hcoll"); \ - hcoll_module->previous_##__api = NULL; \ - hcoll_module->previous_##__api##_module = NULL; \ - } \ - } while (0) - -static int mca_coll_hcoll_save_coll_handlers(mca_coll_hcoll_module_t *hcoll_module) -{ - ompi_communicator_t *comm; - comm = hcoll_module->comm; - - hcoll_module->super.coll_barrier = hcoll_collectives.coll_barrier ? mca_coll_hcoll_barrier : NULL; - hcoll_module->super.coll_bcast = hcoll_collectives.coll_bcast ? mca_coll_hcoll_bcast : NULL; - hcoll_module->super.coll_allgather = hcoll_collectives.coll_allgather ? mca_coll_hcoll_allgather : NULL; - hcoll_module->super.coll_allgatherv = hcoll_collectives.coll_allgatherv ? mca_coll_hcoll_allgatherv : NULL; - hcoll_module->super.coll_allreduce = hcoll_collectives.coll_allreduce ? mca_coll_hcoll_allreduce : NULL; - hcoll_module->super.coll_alltoall = hcoll_collectives.coll_alltoall ? mca_coll_hcoll_alltoall : NULL; - hcoll_module->super.coll_alltoallv = hcoll_collectives.coll_alltoallv ? mca_coll_hcoll_alltoallv : NULL; - hcoll_module->super.coll_gatherv = hcoll_collectives.coll_gatherv ? mca_coll_hcoll_gatherv : NULL; - hcoll_module->super.coll_scatterv = hcoll_collectives.coll_scatterv ? mca_coll_hcoll_scatterv : NULL; - hcoll_module->super.coll_reduce = hcoll_collectives.coll_reduce ? mca_coll_hcoll_reduce : NULL; - hcoll_module->super.coll_ibarrier = hcoll_collectives.coll_ibarrier ? mca_coll_hcoll_ibarrier : NULL; - hcoll_module->super.coll_ibcast = hcoll_collectives.coll_ibcast ? mca_coll_hcoll_ibcast : NULL; - hcoll_module->super.coll_iallgather = hcoll_collectives.coll_iallgather ? mca_coll_hcoll_iallgather : NULL; -#if HCOLL_API >= HCOLL_VERSION(3, 5) - hcoll_module->super.coll_iallgatherv = hcoll_collectives.coll_iallgatherv ? mca_coll_hcoll_iallgatherv : NULL; -#else - hcoll_module->super.coll_iallgatherv = NULL; -#endif - hcoll_module->super.coll_iallreduce = hcoll_collectives.coll_iallreduce ? mca_coll_hcoll_iallreduce : NULL; -#if HCOLL_API >= HCOLL_VERSION(3, 5) - hcoll_module->super.coll_ireduce = hcoll_collectives.coll_ireduce ? 
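/*
 * What one HCOL_INSTALL_COLL_API(comm, hcoll_module, barrier) expansion
 * amounts to, written out long-hand in a hypothetical helper; only the
 * two base macros the wrapper already uses appear here.
 */
static void install_barrier_longhand(struct ompi_communicator_t *comm,
                                     mca_coll_hcoll_module_t *hcoll_module)
{
    if (NULL != hcoll_module->super.coll_barrier &&
        comm->c_coll->coll_barrier && !comm->c_coll->coll_barrier_module) {
        /* remember the currently selected barrier for fallback/uninstall */
        MCA_COLL_SAVE_API(comm, barrier, hcoll_module->previous_barrier,
                          hcoll_module->previous_barrier_module, "hcoll");
        /* point the communicator's collective table at the hcoll wrapper */
        MCA_COLL_INSTALL_API(comm, barrier, hcoll_module->super.coll_barrier,
                             &hcoll_module->super, "hcoll");
    }
}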
mca_coll_hcoll_ireduce : NULL; -#else - hcoll_module->super.coll_ireduce = NULL; -#endif - hcoll_module->super.coll_gather = /*hcoll_collectives.coll_gather ? mca_coll_hcoll_gather :*/ NULL; - hcoll_module->super.coll_igatherv = hcoll_collectives.coll_igatherv ? mca_coll_hcoll_igatherv : NULL; - hcoll_module->super.coll_ialltoall = /*hcoll_collectives.coll_ialltoall ? mca_coll_hcoll_ialltoall : */ NULL; -#if HCOLL_API >= HCOLL_VERSION(3, 7) - hcoll_module->super.coll_ialltoallv = hcoll_collectives.coll_ialltoallv ? mca_coll_hcoll_ialltoallv : NULL; -#else - hcoll_module->super.coll_ialltoallv = NULL; -#endif -#if HCOLL_API > HCOLL_VERSION(4, 5) - hcoll_module->super.coll_reduce_scatter_block = hcoll_collectives.coll_reduce_scatter_block ? mca_coll_hcoll_reduce_scatter_block : NULL; - hcoll_module->super.coll_reduce_scatter = hcoll_collectives.coll_reduce_scatter ? mca_coll_hcoll_reduce_scatter : NULL; -#endif - - HCOL_INSTALL_COLL_API(comm, hcoll_module, barrier); - HCOL_INSTALL_COLL_API(comm, hcoll_module, bcast); - HCOL_INSTALL_COLL_API(comm, hcoll_module, allreduce); - HCOL_INSTALL_COLL_API(comm, hcoll_module, reduce_scatter_block); - HCOL_INSTALL_COLL_API(comm, hcoll_module, reduce_scatter); - HCOL_INSTALL_COLL_API(comm, hcoll_module, reduce); - HCOL_INSTALL_COLL_API(comm, hcoll_module, allgather); - HCOL_INSTALL_COLL_API(comm, hcoll_module, allgatherv); - HCOL_INSTALL_COLL_API(comm, hcoll_module, gatherv); - HCOL_INSTALL_COLL_API(comm, hcoll_module, scatterv); - HCOL_INSTALL_COLL_API(comm, hcoll_module, alltoall); - HCOL_INSTALL_COLL_API(comm, hcoll_module, alltoallv); - - HCOL_INSTALL_COLL_API(comm, hcoll_module, ibarrier); - HCOL_INSTALL_COLL_API(comm, hcoll_module, ibcast); - HCOL_INSTALL_COLL_API(comm, hcoll_module, iallreduce); - HCOL_INSTALL_COLL_API(comm, hcoll_module, ireduce); - HCOL_INSTALL_COLL_API(comm, hcoll_module, iallgather); - HCOL_INSTALL_COLL_API(comm, hcoll_module, iallgatherv); - HCOL_INSTALL_COLL_API(comm, hcoll_module, igatherv); - HCOL_INSTALL_COLL_API(comm, hcoll_module, ialltoall); - HCOL_INSTALL_COLL_API(comm, hcoll_module, ialltoallv); - - /* - These collectives are not yet part of hcoll, so - don't retain them on hcoll module - HCOL_INSTALL_COLL_API(comm, hcoll_module, reduce_scatter); - HCOL_INSTALL_COLL_API(comm, hcoll_module, gather); - HCOL_INSTALL_COLL_API(comm, hcoll_module, reduce); - HCOL_INSTALL_COLL_API(comm, hcoll_module, allgatherv); - HCOL_INSTALL_COLL_API(comm, hcoll_module, alltoallw); - */ - return OMPI_SUCCESS; -} - - - -/* -** Communicator free callback -*/ -static int hcoll_comm_attr_del_fn(MPI_Comm comm, int keyval, void *attr_val, void *extra) -{ - - mca_coll_hcoll_module_t *hcoll_module; - hcoll_module = (mca_coll_hcoll_module_t*) attr_val; - -#ifdef HAVE_HCOLL_CONTEXT_FREE - hcoll_context_free(hcoll_module->hcoll_context, (rte_grp_handle_t)comm); -#else - hcoll_group_destroy_notify(hcoll_module->hcoll_context); -#endif - return OMPI_SUCCESS; - -} -/* - * Initialize module on the communicator - */ -static int mca_coll_hcoll_module_enable(mca_coll_base_module_t *module, - struct ompi_communicator_t *comm) -{ - int ret; - - if (OMPI_SUCCESS != mca_coll_hcoll_save_coll_handlers((mca_coll_hcoll_module_t *)module)){ - HCOL_ERROR("coll_hcol: mca_coll_hcoll_save_coll_handlers failed"); - return OMPI_ERROR; - } - - ret = ompi_attr_set_c(COMM_ATTR, comm, &comm->c_keyhash, hcoll_comm_attr_keyval, (void *)module, false); - if (OMPI_SUCCESS != ret) { - HCOL_VERBOSE(1,"hcoll ompi_attr_set_c failed"); - return OMPI_ERROR; - } - - return 
OMPI_SUCCESS; -} - -static int mca_coll_hcoll_module_disable(mca_coll_base_module_t *module, - struct ompi_communicator_t *comm) -{ - mca_coll_hcoll_module_t *hcoll_module = (mca_coll_hcoll_module_t *)module; - - HCOL_UNINSTALL_COLL_API(comm, hcoll_module, barrier); - HCOL_UNINSTALL_COLL_API(comm, hcoll_module, bcast); - HCOL_UNINSTALL_COLL_API(comm, hcoll_module, allreduce); - HCOL_UNINSTALL_COLL_API(comm, hcoll_module, reduce_scatter_block); - HCOL_UNINSTALL_COLL_API(comm, hcoll_module, reduce_scatter); - HCOL_UNINSTALL_COLL_API(comm, hcoll_module, reduce); - HCOL_UNINSTALL_COLL_API(comm, hcoll_module, allgather); - HCOL_UNINSTALL_COLL_API(comm, hcoll_module, allgatherv); - HCOL_UNINSTALL_COLL_API(comm, hcoll_module, gatherv); - HCOL_UNINSTALL_COLL_API(comm, hcoll_module, scatterv); - HCOL_UNINSTALL_COLL_API(comm, hcoll_module, alltoall); - HCOL_UNINSTALL_COLL_API(comm, hcoll_module, alltoallv); - - HCOL_UNINSTALL_COLL_API(comm, hcoll_module, ibarrier); - HCOL_UNINSTALL_COLL_API(comm, hcoll_module, ibcast); - HCOL_UNINSTALL_COLL_API(comm, hcoll_module, iallreduce); - HCOL_UNINSTALL_COLL_API(comm, hcoll_module, ireduce); - HCOL_UNINSTALL_COLL_API(comm, hcoll_module, iallgather); - HCOL_UNINSTALL_COLL_API(comm, hcoll_module, iallgatherv); - HCOL_UNINSTALL_COLL_API(comm, hcoll_module, igatherv); - HCOL_UNINSTALL_COLL_API(comm, hcoll_module, ialltoall); - HCOL_UNINSTALL_COLL_API(comm, hcoll_module, ialltoallv); - - /* - These collectives are not yet part of hcoll, so - don't retain them on hcoll module - HCOL_UNINSTALL_COLL_API(comm, hcoll_module, reduce_scatter); - HCOL_UNINSTALL_COLL_API(comm, hcoll_module, gather); - HCOL_UNINSTALL_COLL_API(comm, hcoll_module, reduce); - HCOL_UNINSTALL_COLL_API(comm, hcoll_module, allgatherv); - HCOL_UNINSTALL_COLL_API(comm, hcoll_module, alltoallw); - */ - return OMPI_SUCCESS; -} - -OBJ_CLASS_INSTANCE(mca_coll_hcoll_dtype_t, - opal_free_list_item_t, - NULL,NULL); - -/* - * Invoked when there's a new communicator that has been created. - * Look at the communicator and decide which set of functions and - * priority we want to return. - */ -mca_coll_base_module_t * -mca_coll_hcoll_comm_query(struct ompi_communicator_t *comm, int *priority) -{ - mca_coll_base_module_t *module; - mca_coll_hcoll_module_t *hcoll_module; - ompi_attribute_fn_ptr_union_t del_fn; - ompi_attribute_fn_ptr_union_t copy_fn; - mca_coll_hcoll_component_t *cm; - int err; - int rc; - cm = &mca_coll_hcoll_component; - *priority = 0; - module = NULL; - - if (!cm->hcoll_enable){ - return NULL; - } - - if (OMPI_COMM_IS_INTER(comm) || ompi_comm_size(comm) < cm->hcoll_np - || ompi_comm_size(comm) < 2){ - return NULL; - } - - - if (!cm->libhcoll_initialized) - { - /* libhcoll should be initialized here since current implementation of - mxm bcol in libhcoll needs world_group fully functional during init - world_group, i.e. 
ompi_comm_world, is not ready at hcoll component open - call */ - opal_progress_register(hcoll_progress_fn); - - HCOL_VERBOSE(10,"Calling hcoll_init();"); -#if HCOLL_API >= HCOLL_VERSION(3,2) - hcoll_read_init_opts(&cm->init_opts); - cm->init_opts->base_tag = MCA_COLL_BASE_TAG_HCOLL_BASE; - cm->init_opts->max_tag = mca_pml.pml_max_tag; - cm->init_opts->enable_thread_support = ompi_mpi_thread_multiple; - - rc = hcoll_init_with_opts(&cm->init_opts); -#else - hcoll_set_runtime_tag_offset(MCA_COLL_BASE_TAG_HCOLL_BASE, mca_pml.pml_max_tag); - rc = hcoll_init(); -#endif - - if (HCOLL_SUCCESS != rc){ - cm->hcoll_enable = 0; - opal_progress_unregister(hcoll_progress_fn); - HCOL_ERROR("Hcol library init failed"); - return NULL; - } -#if HCOLL_API >= HCOLL_VERSION(3,2) - if (cm->init_opts->mem_hook_needed) { -#else - if (hcoll_check_mem_release_cb_needed()) { -#endif - rc = mca_base_framework_open(&opal_memory_base_framework, 0); - if (OPAL_SUCCESS != rc) { - HCOL_VERBOSE(1, "failed to initialize memory base framework: %d, " - "memory hooks will not be used", rc); - } else { - if ((OPAL_MEMORY_FREE_SUPPORT | OPAL_MEMORY_MUNMAP_SUPPORT) == - ((OPAL_MEMORY_FREE_SUPPORT | OPAL_MEMORY_MUNMAP_SUPPORT) & - opal_mem_hooks_support_level())) { - HCOL_VERBOSE(1, "using OPAL memory hooks as external events"); - cm->using_mem_hooks = 1; - opal_mem_hooks_register_release(mca_coll_hcoll_mem_release_cb, NULL); - setenv("MXM_HCOLL_MEM_ON_DEMAND_MAP", "y", 0); - } - } - } else { - cm->using_mem_hooks = 0; - } - copy_fn.attr_communicator_copy_fn = MPI_COMM_NULL_COPY_FN; - del_fn.attr_communicator_delete_fn = hcoll_comm_attr_del_fn; - err = ompi_attr_create_keyval(COMM_ATTR, copy_fn, del_fn, &hcoll_comm_attr_keyval, NULL ,0, NULL); - if (OMPI_SUCCESS != err) { - cm->hcoll_enable = 0; - hcoll_finalize(); - opal_progress_unregister(hcoll_progress_fn); - HCOL_ERROR("Hcol comm keyval create failed"); - return NULL; - } - - if (mca_coll_hcoll_component.derived_types_support_enabled) { - zero_dte_mapping.type = DTE_ZERO; - copy_fn.attr_datatype_copy_fn = MPI_TYPE_NULL_COPY_FN; - del_fn.attr_datatype_delete_fn = hcoll_type_attr_del_fn; - err = ompi_attr_create_keyval(TYPE_ATTR, copy_fn, del_fn, &hcoll_type_attr_keyval, NULL ,0, NULL); - if (OMPI_SUCCESS != err) { - cm->hcoll_enable = 0; - hcoll_finalize(); - opal_progress_unregister(hcoll_progress_fn); - HCOL_ERROR("Hcol type keyval create failed"); - return NULL; - } - } - OBJ_CONSTRUCT(&cm->dtypes, opal_free_list_t); - opal_free_list_init(&cm->dtypes, sizeof(mca_coll_hcoll_dtype_t), - 8, OBJ_CLASS(mca_coll_hcoll_dtype_t), 0, 0, - 32, -1, 32, NULL, 0, NULL, NULL, NULL); - - } - - hcoll_module = OBJ_NEW(mca_coll_hcoll_module_t); - if (!hcoll_module){ - if (!cm->libhcoll_initialized) { - cm->hcoll_enable = 0; - hcoll_finalize(); - opal_progress_unregister(hcoll_progress_fn); - } - return NULL; - } - - hcoll_module->comm = comm; - - HCOL_VERBOSE(10,"Creating hcoll_context for comm %p, comm_id %d, comm_size %d", - (void*)comm,comm->c_index,ompi_comm_size(comm)); - - hcoll_module->hcoll_context = - hcoll_create_context((rte_grp_handle_t)comm); - - if (NULL == hcoll_module->hcoll_context){ - HCOL_VERBOSE(1,"hcoll_create_context returned NULL"); - OBJ_RELEASE(hcoll_module); - if (!cm->libhcoll_initialized) { - cm->hcoll_enable = 0; - hcoll_finalize(); - opal_progress_unregister(hcoll_progress_fn); - } - return NULL; - } - - hcoll_module->super.coll_module_enable = mca_coll_hcoll_module_enable; - hcoll_module->super.coll_module_disable = mca_coll_hcoll_module_disable; - - *priority 
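/*
 * The query path here initializes libhcoll lazily on the first
 * communicator query rather than at component open, and only flips
 * libhcoll_initialized after the whole sequence succeeds. A stripped-down
 * sketch of that guard; my_component_t and my_lib_init are hypothetical
 * stand-ins.
 */
typedef struct { int lib_initialized; } my_component_t;
extern int my_lib_init(void);   /* hypothetical one-time library init */

static int ensure_lib_ready(my_component_t *cm)
{
    if (cm->lib_initialized) {
        return 0;                /* an earlier query already did the work */
    }
    if (0 != my_lib_init()) {
        return -1;               /* leave the flag unset so error paths know
                                    init never completed */
    }
    cm->lib_initialized = 1;     /* flip only after full success */
    return 0;
}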
= cm->hcoll_priority; - module = &hcoll_module->super; - - if (!cm->libhcoll_initialized) { - cm->libhcoll_initialized = true; - } - - return module; -} - - -OBJ_CLASS_INSTANCE(mca_coll_hcoll_module_t, - mca_coll_base_module_t, - mca_coll_hcoll_module_construct, - mca_coll_hcoll_module_destruct); - -static void safety_valve(void) __opal_attribute_destructor__; -void safety_valve(void) { - opal_mem_hooks_unregister_release(mca_coll_hcoll_mem_release_cb); -} diff --git a/ompi/mca/coll/hcoll/coll_hcoll_ops.c b/ompi/mca/coll/hcoll/coll_hcoll_ops.c deleted file mode 100644 index e491899d2dd..00000000000 --- a/ompi/mca/coll/hcoll/coll_hcoll_ops.c +++ /dev/null @@ -1,945 +0,0 @@ -/** - Copyright (c) 2011 Mellanox Technologies. All rights reserved. - Copyright (c) 2015 Research Organization for Information Science - and Technology (RIST). All rights reserved. - Copyright (c) 2018 Cisco Systems, Inc. All rights reserved - $COPYRIGHT$ - - Additional copyrights may follow - - $HEADER$ - */ - -#include "ompi_config.h" -#include "ompi/constants.h" -#include "coll_hcoll.h" -#include "hcoll/api/hcoll_constants.h" -#include "coll_hcoll_dtypes.h" -#include "hcoll/api/hcoll_dte.h" -int mca_coll_hcoll_barrier(struct ompi_communicator_t *comm, - mca_coll_base_module_t *module){ - int rc; - mca_coll_hcoll_module_t *hcoll_module = (mca_coll_hcoll_module_t*)module; - HCOL_VERBOSE(20,"RUNNING HCOL BARRIER"); - - if (OPAL_UNLIKELY(ompi_mpi_state >= OMPI_MPI_STATE_FINALIZE_STARTED)) { - HCOL_VERBOSE(5, "In finalize, reverting to previous barrier"); - goto orig_barrier; - } - rc = hcoll_collectives.coll_barrier(hcoll_module->hcoll_context); - if (HCOLL_SUCCESS != rc){ - HCOL_VERBOSE(20,"RUNNING FALLBACK BARRIER"); - rc = hcoll_module->previous_barrier(comm,hcoll_module->previous_barrier_module); - } - return rc; -orig_barrier: - return hcoll_module->previous_barrier(comm,hcoll_module->previous_barrier_module); -} - -int mca_coll_hcoll_bcast(void *buff, size_t count, - struct ompi_datatype_t *datatype, int root, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) -{ - dte_data_representation_t dtype; - int rc; - HCOL_VERBOSE(20,"RUNNING HCOL BCAST"); - mca_coll_hcoll_module_t *hcoll_module = (mca_coll_hcoll_module_t*)module; - dtype = ompi_dtype_2_hcoll_dtype(datatype, TRY_FIND_DERIVED); - - if (OPAL_UNLIKELY(HCOL_DTE_IS_ZERO(dtype))) { - /*If we are here then datatype is not simple predefined datatype */ - /*In future we need to add more complex mapping to the dte_data_representation_t */ - /* Now use fallback */ - HCOL_VERBOSE(20,"Ompi_datatype is not supported: %s; calling fallback bcast;",datatype->super.name); - rc = hcoll_module->previous_bcast(buff,count,datatype,root, - comm,hcoll_module->previous_bcast_module); - return rc; - } - rc = hcoll_collectives.coll_bcast(buff,count,dtype,root,hcoll_module->hcoll_context); - if (HCOLL_SUCCESS != rc){ - HCOL_VERBOSE(20,"RUNNING FALLBACK BCAST"); - rc = hcoll_module->previous_bcast(buff,count,datatype,root, - comm,hcoll_module->previous_bcast_module); - } - return rc; -} - -int mca_coll_hcoll_allgather(const void *sbuf, size_t scount, - struct ompi_datatype_t *sdtype, - void *rbuf, size_t rcount, - struct ompi_datatype_t *rdtype, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) -{ - dte_data_representation_t stype; - dte_data_representation_t rtype; - int rc; - HCOL_VERBOSE(20,"RUNNING HCOL ALLGATHER"); - mca_coll_hcoll_module_t *hcoll_module = (mca_coll_hcoll_module_t*)module; - stype = ompi_dtype_2_hcoll_dtype(sdtype, 
TRY_FIND_DERIVED); - rtype = ompi_dtype_2_hcoll_dtype(rdtype, TRY_FIND_DERIVED); - if (sbuf == MPI_IN_PLACE) { - stype = rtype; - } - if (OPAL_UNLIKELY(HCOL_DTE_IS_ZERO(stype) || HCOL_DTE_IS_ZERO(rtype))) { - /*If we are here then datatype is not simple predefined datatype */ - /*In future we need to add more complex mapping to the dte_data_representation_t */ - /* Now use fallback */ - HCOL_VERBOSE(20,"Ompi_datatype is not supported: sdtype = %s, rdtype = %s; calling fallback allgather;", - sdtype->super.name, - rdtype->super.name); - rc = hcoll_module->previous_allgather(sbuf,scount,sdtype, - rbuf,rcount,rdtype, - comm, - hcoll_module->previous_allgather_module); - return rc; - } - rc = hcoll_collectives.coll_allgather((void *)sbuf,scount,stype,rbuf,rcount,rtype,hcoll_module->hcoll_context); - if (HCOLL_SUCCESS != rc){ - HCOL_VERBOSE(20,"RUNNING FALLBACK ALLGATHER"); - rc = hcoll_module->previous_allgather(sbuf,scount,sdtype, - rbuf,rcount,rdtype, - comm, - hcoll_module->previous_allgather_module); - } - return rc; -} - -int mca_coll_hcoll_allgatherv(const void *sbuf, int scount, - struct ompi_datatype_t *sdtype, - void *rbuf, ompi_count_array_t rcount, - ompi_disp_array_t displs, - struct ompi_datatype_t *rdtype, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) -{ - dte_data_representation_t stype; - dte_data_representation_t rtype; - int rc; - HCOL_VERBOSE(20,"RUNNING HCOL ALLGATHERV"); - mca_coll_hcoll_module_t *hcoll_module = (mca_coll_hcoll_module_t*)module; - stype = ompi_dtype_2_hcoll_dtype(sdtype, NO_DERIVED); - rtype = ompi_dtype_2_hcoll_dtype(rdtype, NO_DERIVED); - if (OPAL_UNLIKELY(HCOL_DTE_IS_ZERO(stype) || HCOL_DTE_IS_ZERO(rtype) - || ompi_count_array_is_64bit(rcount))) { - /*If we are here then datatype is not simple predefined datatype */ - /*In future we need to add more complex mapping to the dte_data_representation_t */ - /* Now use fallback */ - HCOL_VERBOSE(20,"Ompi_datatype is not supported: sdtype = %s, rdtype = %s; calling fallback allgatherv;", - sdtype->super.name, - rdtype->super.name); - rc = hcoll_module->previous_allgatherv(sbuf,scount,sdtype, - rbuf,rcount, - displs, - rdtype, - comm, - hcoll_module->previous_allgatherv_module); - return rc; - } - rc = hcoll_collectives.coll_allgatherv((void *)sbuf, - scount, - stype, - rbuf, - (void *)ompi_count_array_ptr(rcount), - (void *)ompi_disp_array_ptr(displs), - rtype, - hcoll_module->hcoll_context); - if (HCOLL_SUCCESS != rc){ - HCOL_VERBOSE(20,"RUNNING FALLBACK ALLGATHERV"); - rc = hcoll_module->previous_allgatherv(sbuf,scount,sdtype, - rbuf,rcount, - displs, - rdtype, - comm, - hcoll_module->previous_allgatherv_module); - } - return rc; -} - -int mca_coll_hcoll_gather(const void *sbuf, size_t scount, - struct ompi_datatype_t *sdtype, - void *rbuf, size_t rcount, - struct ompi_datatype_t *rdtype, - int root, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) -{ - mca_coll_hcoll_module_t *hcoll_module = (mca_coll_hcoll_module_t*)module; - dte_data_representation_t stype; - dte_data_representation_t rtype; - int rc; - - HCOL_VERBOSE(20,"RUNNING HCOL GATHER"); - - if (root != comm->c_my_rank) { - rdtype = sdtype; - } - - stype = ompi_dtype_2_hcoll_dtype(sdtype, NO_DERIVED); - rtype = ompi_dtype_2_hcoll_dtype(rdtype, NO_DERIVED); - if (OPAL_UNLIKELY(HCOL_DTE_IS_ZERO(stype) || HCOL_DTE_IS_ZERO(rtype))) { - /*If we are here then datatype is not simple predefined datatype */ - /*In future we need to add more complex mapping to the dte_data_representation_t */ - /* Now use fallback 
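/*
 * Every blocking wrapper in this file follows the shape shown in
 * mca_coll_hcoll_bcast() above: translate the datatype, bail out to the
 * saved previous_* function if translation fails, otherwise try hcoll and
 * still fall back on a runtime error. A condensed restatement folding the
 * two fallback branches into one; the function name is hypothetical.
 */
static int bcast_condensed(void *buf, size_t count, struct ompi_datatype_t *dt,
                           int root, struct ompi_communicator_t *comm,
                           mca_coll_hcoll_module_t *m)
{
    dte_data_representation_t dte = ompi_dtype_2_hcoll_dtype(dt, TRY_FIND_DERIVED);
    if (HCOL_DTE_IS_ZERO(dte) ||
        HCOLL_SUCCESS != hcoll_collectives.coll_bcast(buf, count, dte, root,
                                                      m->hcoll_context)) {
        /* unsupported datatype or hcoll runtime failure: delegate */
        return m->previous_bcast(buf, count, dt, root, comm,
                                 m->previous_bcast_module);
    }
    return OMPI_SUCCESS;  /* the original returns hcoll's rc directly */
}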
*/ - HCOL_VERBOSE(20,"Ompi_datatype is not supported: sdtype = %s, rdtype = %s; calling fallback gather;", - sdtype->super.name, - rdtype->super.name); - rc = hcoll_module->previous_gather(sbuf,scount,sdtype, - rbuf,rcount,rdtype,root, - comm, - hcoll_module->previous_allgather_module); - return rc; - } - rc = hcoll_collectives.coll_gather((void *)sbuf,scount,stype,rbuf,rcount,rtype,root,hcoll_module->hcoll_context); - if (HCOLL_SUCCESS != rc){ - HCOL_VERBOSE(20,"RUNNING FALLBACK GATHER"); - rc = hcoll_module->previous_gather((void *)sbuf,scount,sdtype, - rbuf,rcount,rdtype,root, - comm, - hcoll_module->previous_allgather_module); - } - return rc; - -} - -int mca_coll_hcoll_allreduce(const void *sbuf, void *rbuf, size_t count, - struct ompi_datatype_t *dtype, - struct ompi_op_t *op, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) -{ - dte_data_representation_t Dtype; - hcoll_dte_op_t *Op; - int rc; - HCOL_VERBOSE(20,"RUNNING HCOL ALLREDUCE"); - mca_coll_hcoll_module_t *hcoll_module = (mca_coll_hcoll_module_t*)module; - Dtype = ompi_dtype_2_hcoll_dtype(dtype, NO_DERIVED); - if (OPAL_UNLIKELY(HCOL_DTE_IS_ZERO(Dtype))){ - /*If we are here then datatype is not simple predefined datatype */ - /*In future we need to add more complex mapping to the dte_data_representation_t */ - /* Now use fallback */ - HCOL_VERBOSE(20,"Ompi_datatype is not supported: dtype = %s; calling fallback allreduce;", - dtype->super.name); - rc = hcoll_module->previous_allreduce(sbuf,rbuf, - count,dtype,op, - comm, hcoll_module->previous_allreduce_module); - return rc; - } - - Op = ompi_op_2_hcolrte_op(op); - if (OPAL_UNLIKELY(HCOL_DTE_OP_NULL == Op->id)){ - /*If we are here then datatype is not simple predefined datatype */ - /*In future we need to add more complex mapping to the dte_data_representation_t */ - /* Now use fallback */ - HCOL_VERBOSE(20,"ompi_op_t is not supported: op = %s; calling fallback allreduce;", - op->o_name); - rc = hcoll_module->previous_allreduce(sbuf,rbuf, - count,dtype,op, - comm, hcoll_module->previous_allreduce_module); - return rc; - } - - rc = hcoll_collectives.coll_allreduce((void *)sbuf,rbuf,count,Dtype,Op,hcoll_module->hcoll_context); - if (HCOLL_SUCCESS != rc){ - HCOL_VERBOSE(20,"RUNNING FALLBACK ALLREDUCE"); - rc = hcoll_module->previous_allreduce(sbuf,rbuf, - count,dtype,op, - comm, hcoll_module->previous_allreduce_module); - } - return rc; -} - -int mca_coll_hcoll_reduce(const void *sbuf, void *rbuf, size_t count, - struct ompi_datatype_t *dtype, - struct ompi_op_t *op, - int root, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) -{ - dte_data_representation_t Dtype; - hcoll_dte_op_t *Op; - int rc; - HCOL_VERBOSE(20,"RUNNING HCOL REDUCE"); - mca_coll_hcoll_module_t *hcoll_module = (mca_coll_hcoll_module_t*)module; - Dtype = ompi_dtype_2_hcoll_dtype(dtype, NO_DERIVED); - if (OPAL_UNLIKELY(HCOL_DTE_IS_ZERO(Dtype))){ - /*If we are here then datatype is not simple predefined datatype */ - /*In future we need to add more complex mapping to the dte_data_representation_t */ - /* Now use fallback */ - HCOL_VERBOSE(20,"Ompi_datatype is not supported: dtype = %s; calling fallback reduce;", - dtype->super.name); - rc = hcoll_module->previous_reduce(sbuf,rbuf, - count,dtype,op, - root, - comm, hcoll_module->previous_reduce_module); - return rc; - } - - Op = ompi_op_2_hcolrte_op(op); - if (OPAL_UNLIKELY(HCOL_DTE_OP_NULL == Op->id)){ - /*If we are here then datatype is not simple predefined datatype */ - /*In future we need to add more complex mapping to 
the dte_data_representation_t */ - /* Now use fallback */ - HCOL_VERBOSE(20,"ompi_op_t is not supported: op = %s; calling fallback reduce;", - op->o_name); - rc = hcoll_module->previous_reduce(sbuf,rbuf, - count,dtype,op, - root, - comm, hcoll_module->previous_reduce_module); - return rc; - } - - rc = hcoll_collectives.coll_reduce((void *)sbuf,rbuf,count,Dtype,Op,root,hcoll_module->hcoll_context); - if (HCOLL_SUCCESS != rc){ - HCOL_VERBOSE(20,"RUNNING FALLBACK REDUCE"); - rc = hcoll_module->previous_reduce(sbuf,rbuf, - count,dtype,op, - root, - comm, hcoll_module->previous_reduce_module); - } - return rc; -} - -int mca_coll_hcoll_alltoall(const void *sbuf, size_t scount, - struct ompi_datatype_t *sdtype, - void* rbuf, size_t rcount, - struct ompi_datatype_t *rdtype, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) -{ - dte_data_representation_t stype; - dte_data_representation_t rtype; - int rc; - HCOL_VERBOSE(20,"RUNNING HCOL ALLTOALL"); - mca_coll_hcoll_module_t *hcoll_module = (mca_coll_hcoll_module_t*)module; - stype = ompi_dtype_2_hcoll_dtype(sdtype, NO_DERIVED); - rtype = ompi_dtype_2_hcoll_dtype(rdtype, NO_DERIVED); - if (OPAL_UNLIKELY(HCOL_DTE_IS_ZERO(stype) || HCOL_DTE_IS_ZERO(rtype))) { - /*If we are here then datatype is not simple predefined datatype */ - /*In future we need to add more complex mapping to the dte_data_representation_t */ - /* Now use fallback */ - HCOL_VERBOSE(20,"Ompi_datatype is not supported: sdtype = %s, rdtype = %s; calling fallback alltoall;", - sdtype->super.name, - rdtype->super.name); - rc = hcoll_module->previous_alltoall(sbuf,scount,sdtype, - rbuf,rcount,rdtype, - comm, - hcoll_module->previous_alltoall_module); - return rc; - } - rc = hcoll_collectives.coll_alltoall((void *)sbuf,scount,stype,rbuf,rcount,rtype,hcoll_module->hcoll_context); - if (HCOLL_SUCCESS != rc){ - HCOL_VERBOSE(20,"RUNNING FALLBACK ALLTOALL"); - rc = hcoll_module->previous_alltoall(sbuf,scount,sdtype, - rbuf,rcount,rdtype, - comm, - hcoll_module->previous_alltoall_module); - } - return rc; -} - -int mca_coll_hcoll_alltoallv(const void *sbuf, ompi_count_array_t scounts, ompi_disp_array_t sdisps, - struct ompi_datatype_t *sdtype, - void *rbuf, ompi_count_array_t rcounts, ompi_disp_array_t rdisps, - struct ompi_datatype_t *rdtype, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) -{ - dte_data_representation_t stype; - dte_data_representation_t rtype; - int rc; - HCOL_VERBOSE(20,"RUNNING HCOL ALLTOALLV"); - mca_coll_hcoll_module_t *hcoll_module = (mca_coll_hcoll_module_t*)module; - stype = ompi_dtype_2_hcoll_dtype(sdtype, NO_DERIVED); - rtype = ompi_dtype_2_hcoll_dtype(rdtype, NO_DERIVED); - if (OPAL_UNLIKELY(HCOL_DTE_IS_ZERO(stype) || HCOL_DTE_IS_ZERO(rtype) - || ompi_count_array_is_64bit(scounts))) { - HCOL_VERBOSE(20,"Ompi_datatype is not supported: sdtype = %s, rdtype = %s; calling fallback alltoallv;", - sdtype->super.name, - rdtype->super.name); - rc = hcoll_module->previous_alltoallv(sbuf, scounts, sdisps, sdtype, - rbuf, rcounts, rdisps, rdtype, - comm, hcoll_module->previous_alltoallv_module); - return rc; - } - rc = hcoll_collectives.coll_alltoallv((void *)sbuf, - (void *)ompi_count_array_ptr(scounts), - (void *)ompi_disp_array_ptr(sdisps), - stype, - rbuf, - (void *)ompi_count_array_ptr(rcounts), - (void *)ompi_disp_array_ptr(rdisps), - rtype, - hcoll_module->hcoll_context); - if (HCOLL_SUCCESS != rc){ - HCOL_VERBOSE(20,"RUNNING FALLBACK ALLTOALLV"); - rc = hcoll_module->previous_alltoallv(sbuf, scounts, sdisps, sdtype, - rbuf, 
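/*
 * The reduction paths above veto two things independently: a datatype
 * with no DTE mapping (NO_DERIVED mode, stricter than bcast's
 * TRY_FIND_DERIVED) and an MPI op with no hcoll counterpart. A condensed
 * sketch of that double check; hcoll_can_reduce is a hypothetical name
 * and <stdbool.h> is assumed.
 */
static bool hcoll_can_reduce(ompi_datatype_t *dt, ompi_op_t *op)
{
    dte_data_representation_t d = ompi_dtype_2_hcoll_dtype(dt, NO_DERIVED);
    hcoll_dte_op_t *o = ompi_op_2_hcolrte_op(op);
    /* DTE_ZERO or the null op id both mean: use the previous_* fallback */
    return !HCOL_DTE_IS_ZERO(d) && (HCOL_DTE_OP_NULL != o->id);
}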
rcounts, rdisps, rdtype, - comm, hcoll_module->previous_alltoallv_module); - } - return rc; -} - -int mca_coll_hcoll_gatherv(const void* sbuf, size_t scount, - struct ompi_datatype_t *sdtype, - void* rbuf, ompi_count_array_t rcounts, ompi_disp_array_t displs, - struct ompi_datatype_t *rdtype, - int root, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) -{ - mca_coll_hcoll_module_t *hcoll_module = (mca_coll_hcoll_module_t*)module; - dte_data_representation_t stype; - dte_data_representation_t rtype; - int rc; - HCOL_VERBOSE(20,"RUNNING HCOL GATHERV"); - - if (root != comm->c_my_rank) { - rdtype = sdtype; - } - - stype = ompi_dtype_2_hcoll_dtype(sdtype, NO_DERIVED); - rtype = ompi_dtype_2_hcoll_dtype(rdtype, NO_DERIVED); - - if (OPAL_UNLIKELY(HCOL_DTE_IS_ZERO(stype) || HCOL_DTE_IS_ZERO(rtype) - || ompi_count_array_is_64bit(rcounts))) { - /*If we are here then datatype is not simple predefined datatype */ - /*In future we need to add more complex mapping to the dte_data_representation_t */ - /* Now use fallback */ - HCOL_VERBOSE(20,"Ompi_datatype is not supported: sdtype = %s, rdtype = %s; calling fallback gatherv;", - sdtype->super.name, - rdtype->super.name); - rc = hcoll_module->previous_gatherv(sbuf,scount,sdtype, - rbuf, rcounts, displs, rdtype,root, - comm, hcoll_module->previous_gatherv_module); - return rc; - } - rc = hcoll_collectives.coll_gatherv((void *)sbuf, - scount, - stype, - rbuf, - (void *)ompi_count_array_ptr(rcounts), - (void *)ompi_disp_array_ptr(displs), - rtype, - root, - hcoll_module->hcoll_context); - if (HCOLL_SUCCESS != rc){ - HCOL_VERBOSE(20,"RUNNING FALLBACK GATHERV"); - rc = hcoll_module->previous_gatherv(sbuf,scount,sdtype, - rbuf, rcounts, displs, rdtype,root, - comm, hcoll_module->previous_igatherv_module); - } - return rc; - -} - -int mca_coll_hcoll_scatterv(const void* sbuf, ompi_count_array_t scounts, ompi_disp_array_t displs, - struct ompi_datatype_t *sdtype, - void* rbuf, size_t rcount, - struct ompi_datatype_t *rdtype, - int root, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) -{ - mca_coll_hcoll_module_t *hcoll_module = (mca_coll_hcoll_module_t*)module; - dte_data_representation_t stype; - dte_data_representation_t rtype; - int rc; - - HCOL_VERBOSE(20,"RUNNING HCOL SCATTERV"); - - if (root != comm->c_my_rank) { - sdtype = rdtype; - } - - stype = ompi_dtype_2_hcoll_dtype(sdtype, NO_DERIVED); - rtype = ompi_dtype_2_hcoll_dtype(rdtype, NO_DERIVED); - - if (rbuf == MPI_IN_PLACE) { - assert(root == comm->c_my_rank); - rtype = stype; - } - if (OPAL_UNLIKELY(HCOL_DTE_IS_ZERO(stype) || HCOL_DTE_IS_ZERO(rtype) - || ompi_count_array_is_64bit(scounts))) { - /*If we are here then datatype is not simple predefined datatype */ - /*In future we need to add more complex mapping to the dte_data_representation_t */ - /* Now use fallback */ - HCOL_VERBOSE(20,"Ompi_datatype is not supported: sdtype = %s, rdtype = %s; calling fallback scatterv;", - sdtype->super.name, - rdtype->super.name); - rc = hcoll_module->previous_scatterv(sbuf, scounts, displs, sdtype, - rbuf, rcount, rdtype, root, - comm, hcoll_module->previous_scatterv_module); - return rc; - } - rc = hcoll_collectives.coll_scatterv((void *)sbuf, - (void *)ompi_count_array_ptr(scounts), - (void *)ompi_disp_array_ptr(displs), - stype, - rbuf, - rcount, - rtype, - root, - hcoll_module->hcoll_context); - if (HCOLL_SUCCESS != rc){ - HCOL_VERBOSE(20,"RUNNING FALLBACK SCATTERV"); - rc = hcoll_module->previous_scatterv(sbuf, scounts, displs, - sdtype, - rbuf, - rcount, - rdtype, 
- root, - comm, - hcoll_module->previous_scatterv_module); - } - return rc; -} - -int mca_coll_hcoll_ibarrier(struct ompi_communicator_t *comm, - ompi_request_t ** request, - mca_coll_base_module_t *module) -{ - int rc; - void** rt_handle; - HCOL_VERBOSE(20,"RUNNING HCOL NON-BLOCKING BARRIER"); - mca_coll_hcoll_module_t *hcoll_module = (mca_coll_hcoll_module_t*)module; - rt_handle = (void**) request; - rc = hcoll_collectives.coll_ibarrier(hcoll_module->hcoll_context, rt_handle); - if (HCOLL_SUCCESS != rc){ - HCOL_VERBOSE(20,"RUNNING FALLBACK NON-BLOCKING BARRIER"); - rc = hcoll_module->previous_ibarrier(comm, request, hcoll_module->previous_ibarrier_module); - } - return rc; -} - -int mca_coll_hcoll_ibcast(void *buff, size_t count, - struct ompi_datatype_t *datatype, int root, - struct ompi_communicator_t *comm, - ompi_request_t ** request, - mca_coll_base_module_t *module) -{ - dte_data_representation_t dtype; - int rc; - void** rt_handle; - HCOL_VERBOSE(20,"RUNNING HCOL NON-BLOCKING BCAST"); - mca_coll_hcoll_module_t *hcoll_module = (mca_coll_hcoll_module_t*)module; - rt_handle = (void**) request; - dtype = ompi_dtype_2_hcoll_dtype(datatype, TRY_FIND_DERIVED); - if (OPAL_UNLIKELY(HCOL_DTE_IS_ZERO(dtype))){ - /*If we are here then datatype is not simple predefined datatype */ - /*In future we need to add more complex mapping to the dte_data_representation_t */ - /* Now use fallback */ - HCOL_VERBOSE(20,"Ompi_datatype is not supported: %s; calling fallback non-blocking bcast;",datatype->super.name); - rc = hcoll_module->previous_ibcast(buff,count,datatype,root, - comm, request, hcoll_module->previous_ibcast_module); - return rc; - } - rc = hcoll_collectives.coll_ibcast(buff, count, dtype, root, rt_handle, hcoll_module->hcoll_context); - if (HCOLL_SUCCESS != rc){ - HCOL_VERBOSE(20,"RUNNING FALLBACK NON-BLOCKING BCAST"); - rc = hcoll_module->previous_ibcast(buff,count,datatype,root, - comm, request, hcoll_module->previous_ibcast_module); - } - return rc; -} - -int mca_coll_hcoll_iallgather(const void *sbuf, size_t scount, - struct ompi_datatype_t *sdtype, - void *rbuf, size_t rcount, - struct ompi_datatype_t *rdtype, - struct ompi_communicator_t *comm, - ompi_request_t ** request, - mca_coll_base_module_t *module) -{ - dte_data_representation_t stype; - dte_data_representation_t rtype; - int rc; - void** rt_handle; - HCOL_VERBOSE(20,"RUNNING HCOL NON-BLOCKING ALLGATHER"); - mca_coll_hcoll_module_t *hcoll_module = (mca_coll_hcoll_module_t*)module; - rt_handle = (void**) request; - stype = ompi_dtype_2_hcoll_dtype(sdtype, TRY_FIND_DERIVED); - rtype = ompi_dtype_2_hcoll_dtype(rdtype, TRY_FIND_DERIVED); - if (OPAL_UNLIKELY(HCOL_DTE_IS_ZERO(stype) || HCOL_DTE_IS_ZERO(rtype))) { - /*If we are here then datatype is not simple predefined datatype */ - /*In future we need to add more complex mapping to the dte_data_representation_t */ - /* Now use fallback */ - HCOL_VERBOSE(20,"Ompi_datatype is not supported: sdtype = %s, rdtype = %s; calling fallback non-blocking allgather;", - sdtype->super.name, - rdtype->super.name); - rc = hcoll_module->previous_iallgather(sbuf,scount,sdtype, - rbuf,rcount,rdtype, - comm, - request, - hcoll_module->previous_iallgather_module); - return rc; - } - rc = hcoll_collectives.coll_iallgather((void *)sbuf, scount, stype, rbuf, rcount, rtype, hcoll_module->hcoll_context, rt_handle); - if (HCOLL_SUCCESS != rc){ - HCOL_VERBOSE(20,"RUNNING FALLBACK NON-BLOCKING ALLGATHER"); - rc = hcoll_module->previous_iallgather(sbuf,scount,sdtype, - rbuf,rcount,rdtype, - comm, - request, 
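/*
 * The nonblocking wrappers here all bridge OMPI's request out-parameter
 * to hcoll's opaque handle the same way, by reinterpreting the pointer in
 * place. A condensed sketch with a hypothetical function name:
 */
static int start_ibarrier(mca_coll_hcoll_module_t *m, ompi_request_t **request)
{
    /* hcoll writes its request object through a void**; the caller's
       ompi_request_t** is passed through unchanged */
    void **rt_handle = (void **) request;
    return hcoll_collectives.coll_ibarrier(m->hcoll_context, rt_handle);
}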
- hcoll_module->previous_iallgather_module); - } - return rc; -} -#if HCOLL_API >= HCOLL_VERSION(3,5) -int mca_coll_hcoll_iallgatherv(const void *sbuf, size_t scount, - struct ompi_datatype_t *sdtype, - void *rbuf, ompi_count_array_t rcount, - ompi_disp_array_t displs, - struct ompi_datatype_t *rdtype, - struct ompi_communicator_t *comm, - ompi_request_t ** request, - mca_coll_base_module_t *module) -{ - dte_data_representation_t stype; - dte_data_representation_t rtype; - int rc; - HCOL_VERBOSE(20,"RUNNING HCOL NON-BLOCKING ALLGATHERV"); - mca_coll_hcoll_module_t *hcoll_module = (mca_coll_hcoll_module_t*)module; - stype = ompi_dtype_2_hcoll_dtype(sdtype, NO_DERIVED); - rtype = ompi_dtype_2_hcoll_dtype(rdtype, NO_DERIVED); - void **rt_handle = (void **) request; - if (OPAL_UNLIKELY(HCOL_DTE_IS_ZERO(stype) || HCOL_DTE_IS_ZERO(rtype) - || ompi_count_array_is_64bit(rcount))) { - /*If we are here then datatype is not simple predefined datatype */ - /*In future we need to add more complex mapping to the dte_data_representation_t */ - /* Now use fallback */ - HCOL_VERBOSE(20,"Ompi_datatype is not supported: sdtype = %s, rdtype = %s; calling fallback non-blocking allgatherv;", - sdtype->super.name, - rdtype->super.name); - rc = hcoll_module->previous_iallgatherv(sbuf,scount,sdtype, - rbuf,rcount, - displs, - rdtype, - comm, - request, - hcoll_module->previous_iallgatherv_module); - return rc; - } - rc = hcoll_collectives.coll_iallgatherv((void *)sbuf, - scount, - stype, - rbuf, - (void *)ompi_count_array_ptr(rcount), - (void *)ompi_disp_array_ptr(displs), - rtype, - hcoll_module->hcoll_context, rt_handle); - if (HCOLL_SUCCESS != rc){ - HCOL_VERBOSE(20,"RUNNING FALLBACK NON-BLOCKING ALLGATHER"); - rc = hcoll_module->previous_iallgatherv(sbuf,scount,sdtype, - rbuf,rcount, - displs, - rdtype, - comm, - request, - hcoll_module->previous_iallgatherv_module); - } - return rc; -} -#endif -int mca_coll_hcoll_iallreduce(const void *sbuf, void *rbuf, size_t count, - struct ompi_datatype_t *dtype, - struct ompi_op_t *op, - struct ompi_communicator_t *comm, - ompi_request_t ** request, - mca_coll_base_module_t *module) -{ - dte_data_representation_t Dtype; - hcoll_dte_op_t *Op; - int rc; - void** rt_handle; - HCOL_VERBOSE(20,"RUNNING HCOL NON-BLOCKING ALLREDUCE"); - mca_coll_hcoll_module_t *hcoll_module = (mca_coll_hcoll_module_t*)module; - rt_handle = (void**) request; - Dtype = ompi_dtype_2_hcoll_dtype(dtype, NO_DERIVED); - if (OPAL_UNLIKELY(HCOL_DTE_IS_ZERO(Dtype))){ - /*If we are here then datatype is not simple predefined datatype */ - /*In future we need to add more complex mapping to the dte_data_representation_t */ - /* Now use fallback */ - HCOL_VERBOSE(20,"Ompi_datatype is not supported: dtype = %s; calling fallback non-blocking allreduce;", - dtype->super.name); - rc = hcoll_module->previous_iallreduce(sbuf,rbuf, - count,dtype,op, - comm, request, hcoll_module->previous_iallreduce_module); - return rc; - } - - Op = ompi_op_2_hcolrte_op(op); - if (OPAL_UNLIKELY(HCOL_DTE_OP_NULL == Op->id)){ - /*If we are here then datatype is not simple predefined datatype */ - /*In future we need to add more complex mapping to the dte_data_representation_t */ - /* Now use fallback */ - HCOL_VERBOSE(20,"ompi_op_t is not supported: op = %s; calling fallback non-blocking allreduce;", - op->o_name); - rc = hcoll_module->previous_iallreduce(sbuf,rbuf, - count,dtype,op, - comm, request, hcoll_module->previous_iallreduce_module); - return rc; - } - - rc = hcoll_collectives.coll_iallreduce((void *)sbuf, rbuf, count, 
Dtype, Op, hcoll_module->hcoll_context, rt_handle); - if (HCOLL_SUCCESS != rc){ - HCOL_VERBOSE(20,"RUNNING FALLBACK NON-BLOCKING ALLREDUCE"); - rc = hcoll_module->previous_iallreduce(sbuf,rbuf, - count,dtype,op, - comm, request, hcoll_module->previous_iallreduce_module); - } - return rc; -} -#if HCOLL_API >= HCOLL_VERSION(3,5) -int mca_coll_hcoll_ireduce(const void *sbuf, void *rbuf, size_t count, - struct ompi_datatype_t *dtype, - struct ompi_op_t *op, - int root, - struct ompi_communicator_t *comm, - ompi_request_t ** request, - mca_coll_base_module_t *module) -{ - dte_data_representation_t Dtype; - hcoll_dte_op_t *Op; - int rc; - HCOL_VERBOSE(20,"RUNNING HCOL NON-BLOCKING REDUCE"); - mca_coll_hcoll_module_t *hcoll_module = (mca_coll_hcoll_module_t*)module; - Dtype = ompi_dtype_2_hcoll_dtype(dtype, NO_DERIVED); - void **rt_handle = (void**) request; - if (OPAL_UNLIKELY(HCOL_DTE_IS_ZERO(Dtype))){ - /*If we are here then datatype is not simple predefined datatype */ - /*In future we need to add more complex mapping to the dte_data_representation_t */ - /* Now use fallback */ - HCOL_VERBOSE(20,"Ompi_datatype is not supported: dtype = %s; calling fallback non-blocking reduce;", - dtype->super.name); - rc = hcoll_module->previous_ireduce(sbuf,rbuf,count,dtype,op, - root, - comm, request, - hcoll_module->previous_ireduce_module); - return rc; - } - - Op = ompi_op_2_hcolrte_op(op); - if (OPAL_UNLIKELY(HCOL_DTE_OP_NULL == Op->id)){ - /*If we are here then datatype is not simple predefined datatype */ - /*In future we need to add more complex mapping to the dte_data_representation_t */ - /* Now use fallback */ - HCOL_VERBOSE(20,"ompi_op_t is not supported: op = %s; calling fallback non-blocking reduce;", - op->o_name); - rc = hcoll_module->previous_ireduce(sbuf,rbuf, - count,dtype,op, - root, - comm, request, - hcoll_module->previous_ireduce_module); - return rc; - } - - rc = hcoll_collectives.coll_ireduce((void *)sbuf,rbuf,count,Dtype,Op,root,hcoll_module->hcoll_context,rt_handle); - if (HCOLL_SUCCESS != rc){ - HCOL_VERBOSE(20,"RUNNING FALLBACK NON-BLOCKING REDUCE"); - rc = hcoll_module->previous_ireduce(sbuf,rbuf, - count,dtype,op, - root, - comm, - request, - hcoll_module->previous_ireduce_module); - } - return rc; -} -#endif -int mca_coll_hcoll_igatherv(const void* sbuf, size_t scount, - struct ompi_datatype_t *sdtype, - void* rbuf, ompi_count_array_t rcounts, ompi_disp_array_t displs, - struct ompi_datatype_t *rdtype, - int root, - struct ompi_communicator_t *comm, - ompi_request_t ** request, - mca_coll_base_module_t *module) -{ - mca_coll_hcoll_module_t *hcoll_module = (mca_coll_hcoll_module_t*)module; - dte_data_representation_t stype; - dte_data_representation_t rtype; - int rc; - void** rt_handle; - - HCOL_VERBOSE(20,"RUNNING HCOL IGATHERV"); - - rt_handle = (void**) request; - - if (root != comm->c_my_rank) { - rdtype = sdtype; - } - - stype = ompi_dtype_2_hcoll_dtype(sdtype, NO_DERIVED); - rtype = ompi_dtype_2_hcoll_dtype(rdtype, NO_DERIVED); - if (OPAL_UNLIKELY(HCOL_DTE_IS_ZERO(stype) || HCOL_DTE_IS_ZERO(rtype) - || ompi_count_array_is_64bit(rcounts))) { - /*If we are here then datatype is not simple predefined datatype */ - /*In future we need to add more complex mapping to the dte_data_representation_t */ - /* Now use fallback */ - HCOL_VERBOSE(20,"Ompi_datatype is not supported: sdtype = %s, rdtype = %s; calling fallback igatherv;", - sdtype->super.name, - rdtype->super.name); - rc = hcoll_module->previous_igatherv(sbuf,scount,sdtype, - rbuf, rcounts, displs, rdtype,root, - comm, 
request, - hcoll_module->previous_igatherv_module); - return rc; - } - rc = hcoll_collectives.coll_igatherv((void *)sbuf, - scount, - stype, - rbuf, - (void *)ompi_count_array_ptr(rcounts), - (void *)ompi_disp_array_ptr(displs), - rtype, - root, - hcoll_module->hcoll_context, - rt_handle); - if (HCOLL_SUCCESS != rc){ - HCOL_VERBOSE(20,"RUNNING FALLBACK IGATHERV"); - rc = hcoll_module->previous_igatherv(sbuf,scount,sdtype, - rbuf, rcounts, displs, rdtype,root, - comm, request, - hcoll_module->previous_igatherv_module); - } - return rc; - -} - - -#if HCOLL_API >= HCOLL_VERSION(3,7) -int mca_coll_hcoll_ialltoallv(const void *sbuf, ompi_count_array_t scounts, ompi_disp_array_t sdisps, - struct ompi_datatype_t *sdtype, - void *rbuf, ompi_count_array_t rcounts, ompi_disp_array_t rdisps, - struct ompi_datatype_t *rdtype, - struct ompi_communicator_t *comm, - ompi_request_t ** request, - mca_coll_base_module_t *module) -{ - dte_data_representation_t stype; - dte_data_representation_t rtype; - int rc; - HCOL_VERBOSE(20,"RUNNING HCOL IALLTOALLV"); - mca_coll_hcoll_module_t *hcoll_module = (mca_coll_hcoll_module_t*)module; - stype = ompi_dtype_2_hcoll_dtype(sdtype, NO_DERIVED); - rtype = ompi_dtype_2_hcoll_dtype(rdtype, NO_DERIVED); - if (OPAL_UNLIKELY(HCOL_DTE_IS_ZERO(stype) || HCOL_DTE_IS_ZERO(rtype) - || ompi_count_array_is_64bit(scounts))) { - HCOL_VERBOSE(20,"Ompi_datatype is not supported: sdtype = %s, rdtype = %s; calling fallback ialltoallv;", - sdtype->super.name, - rdtype->super.name); - rc = hcoll_module->previous_ialltoallv(sbuf, scounts, sdisps, sdtype, - rbuf, rcounts, rdisps, rdtype, - comm, request, hcoll_module->previous_alltoallv_module); - return rc; - } - rc = hcoll_collectives.coll_ialltoallv((void *)sbuf, - (void *)ompi_count_array_ptr(scounts), - (void *)ompi_disp_array_ptr(sdisps), - stype, - rbuf, - (void *)ompi_count_array_ptr(rcounts), - (void *)ompi_disp_array_ptr(rdisps), - rtype, - hcoll_module->hcoll_context, - (void**)request); - if (HCOLL_SUCCESS != rc){ - HCOL_VERBOSE(20,"RUNNING FALLBACK IALLTOALLV"); - rc = hcoll_module->previous_ialltoallv(sbuf, scounts, sdisps, sdtype, - rbuf, rcounts, rdisps, rdtype, - comm, request, hcoll_module->previous_alltoallv_module); - } - return rc; -} -#endif - -#if HCOLL_API > HCOLL_VERSION(4,5) -int mca_coll_hcoll_reduce_scatter_block(const void *sbuf, void *rbuf, size_t rcount, - struct ompi_datatype_t *dtype, - struct ompi_op_t *op, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) { - dte_data_representation_t Dtype; - hcoll_dte_op_t *Op; - int rc; - HCOL_VERBOSE(20,"RUNNING HCOL REDUCE SCATTER BLOCK"); - mca_coll_hcoll_module_t *hcoll_module = (mca_coll_hcoll_module_t*)module; - Dtype = ompi_dtype_2_hcoll_dtype(dtype, NO_DERIVED); - if (OPAL_UNLIKELY(HCOL_DTE_IS_ZERO(Dtype))){ - /*If we are here then datatype is not simple predefined datatype */ - /*In future we need to add more complex mapping to the dte_data_representation_t */ - /* Now use fallback */ - HCOL_VERBOSE(20,"Ompi_datatype is not supported: dtype = %s; calling fallback allreduce;", - dtype->super.name); - goto fallback; - } - - Op = ompi_op_2_hcolrte_op(op); - if (OPAL_UNLIKELY(HCOL_DTE_OP_NULL == Op->id)){ - /*If we are here then datatype is not simple predefined datatype */ - /*In future we need to add more complex mapping to the dte_data_representation_t */ - /* Now use fallback */ - HCOL_VERBOSE(20,"ompi_op_t is not supported: op = %s; calling fallback allreduce;", - op->o_name); - goto fallback; - } - - rc = 
hcoll_collectives.coll_reduce_scatter_block((void *)sbuf,rbuf,rcount,Dtype,Op,hcoll_module->hcoll_context); - if (HCOLL_SUCCESS != rc){ - fallback: - HCOL_VERBOSE(20,"RUNNING FALLBACK ALLREDUCE"); - rc = hcoll_module->previous_reduce_scatter_block(sbuf,rbuf, - rcount,dtype,op, - comm, hcoll_module->previous_allreduce_module); - } - return rc; -} - -int mca_coll_hcoll_reduce_scatter(const void *sbuf, void *rbuf, ompi_count_array_t rcounts, - struct ompi_datatype_t *dtype, - struct ompi_op_t *op, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) { - dte_data_representation_t Dtype; - hcoll_dte_op_t *Op; - int rc; - HCOL_VERBOSE(20,"RUNNING HCOL REDUCE SCATTER"); - mca_coll_hcoll_module_t *hcoll_module = (mca_coll_hcoll_module_t*)module; - Dtype = ompi_dtype_2_hcoll_dtype(dtype, NO_DERIVED); - if (OPAL_UNLIKELY(HCOL_DTE_IS_ZERO(Dtype) || ompi_count_array_is_64bit(rcounts))){ - /*If we are here then datatype is not simple predefined datatype */ - /*In future we need to add more complex mapping to the dte_data_representation_t */ - /* Now use fallback */ - HCOL_VERBOSE(20,"Ompi_datatype is not supported: dtype = %s; calling fallback allreduce;", - dtype->super.name); - goto fallback; - } - - Op = ompi_op_2_hcolrte_op(op); - if (OPAL_UNLIKELY(HCOL_DTE_OP_NULL == Op->id)){ - /*If we are here then datatype is not simple predefined datatype */ - /*In future we need to add more complex mapping to the dte_data_representation_t */ - /* Now use fallback */ - HCOL_VERBOSE(20,"ompi_op_t is not supported: op = %s; calling fallback allreduce;", - op->o_name); - goto fallback; - } - - rc = hcoll_collectives.coll_reduce_scatter((void *)sbuf, - rbuf, - (void *)ompi_count_array_ptr(rcounts), - Dtype, - Op, - hcoll_module->hcoll_context); - if (HCOLL_SUCCESS != rc){ - fallback: - HCOL_VERBOSE(20,"RUNNING FALLBACK ALLREDUCE"); - rc = hcoll_module->previous_reduce_scatter(sbuf,rbuf, - rcounts,dtype,op, - comm, hcoll_module->previous_allreduce_module); - } - return rc; -} -#endif diff --git a/ompi/mca/coll/hcoll/coll_hcoll_rte.c b/ompi/mca/coll/hcoll/coll_hcoll_rte.c deleted file mode 100644 index 882880f645e..00000000000 --- a/ompi/mca/coll/hcoll/coll_hcoll_rte.c +++ /dev/null @@ -1,487 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2013 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2015 Los Alamos National Security, LLC. All rights - * reserved. - * Copyright (c) 2015-2019 Research Organization for Information Science - * and Technology (RIST). All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -/** - * @file - * - */ - -#include "ompi_config.h" -#ifdef HAVE_UNISTD_H -#include <unistd.h> -#endif -#include -#ifdef HAVE_SYS_MMAN_H -#include <sys/mman.h> -#endif -#include -#include - -#include "coll_hcoll.h" - -#include "ompi/constants.h" -#include "ompi/communicator/communicator.h" -#include "ompi/datatype/ompi_datatype.h" -#include "ompi/datatype/ompi_datatype_internal.h" -#include "ompi/mca/pml/pml.h" -#include "ompi/mca/coll/base/coll_base_util.h" - -#include "hcoll/api/hcoll_dte.h" -#include "hcoll/api/hcoll_api.h" -#include "hcoll/api/hcoll_constants.h" -#include "coll_hcoll_dtypes.h" -/* - * Local functions - */ - - -static int recv_nb(dte_data_representation_t data , - size_t count , - void *buffer, - rte_ec_handle_t , - rte_grp_handle_t , - uint32_t tag, - rte_request_handle_t * req); - -static int send_nb(dte_data_representation_t data, - size_t count, - void *buffer, - rte_ec_handle_t ec_h, - rte_grp_handle_t grp_h, - uint32_t tag, rte_request_handle_t *req); - -static int test( rte_request_handle_t * request , - int * completed ); - -static int ec_handle_compare( rte_ec_handle_t handle_1 , - rte_grp_handle_t - group_handle_1 , - rte_ec_handle_t handle_2 , - rte_grp_handle_t - group_handle_2 ); - -static int get_ec_handles( int num_ec , - int * ec_indexes , - rte_grp_handle_t , - rte_ec_handle_t * ec_handles ); - -#if 0 /* This callback is not used */ -static int get_my_ec(rte_grp_handle_t , rte_ec_handle_t *ec_handle); -#endif - -static int group_size ( rte_grp_handle_t group ); -static int my_rank (rte_grp_handle_t grp_h); -static int ec_on_local_node (rte_ec_handle_t ec, rte_grp_handle_t group); -static rte_grp_handle_t get_world_group_handle(void); -static uint32_t jobid(void); - -static void progress(void){ - opal_progress(); -} - -static void* get_coll_handle(void); -static int coll_handle_test(void* handle); -static void coll_handle_free(void *handle); -static void coll_handle_complete(void *handle); -static int group_id(rte_grp_handle_t group); - -static int world_rank(rte_grp_handle_t grp_h, rte_ec_handle_t ec); -/* Module Constructors */ -#if HCOLL_API >= HCOLL_VERSION(3,6) -static int get_mpi_type_envelope(void *mpi_type, int *num_integers, - int *num_addresses, int *num_datatypes, - hcoll_mpi_type_combiner_t *combiner); -static int get_mpi_type_contents(void *mpi_type, int max_integers, int max_addresses, - int max_datatypes, int *array_of_integers, - void *array_of_addresses, void *array_of_datatypes); -static int get_hcoll_type(void *mpi_type, dte_data_representation_t *hcoll_type); -static int set_hcoll_type(void *mpi_type, dte_data_representation_t hcoll_type); -static int get_mpi_constants(size_t *mpi_datatype_size, - int *mpi_order_c, int *mpi_order_fortran, - int *mpi_distribute_block, - int *mpi_distribute_cyclic, - int *mpi_distribute_none, - int *mpi_distribute_dflt_darg); -#endif - -static void init_module_fns(void){ - hcoll_rte_functions.send_fn = send_nb; - hcoll_rte_functions.recv_fn = recv_nb; - hcoll_rte_functions.ec_cmp_fn = ec_handle_compare; - hcoll_rte_functions.get_ec_handles_fn = get_ec_handles; - hcoll_rte_functions.rte_group_size_fn = group_size; - hcoll_rte_functions.test_fn = test; - hcoll_rte_functions.rte_my_rank_fn = my_rank; - hcoll_rte_functions.rte_ec_on_local_node_fn = ec_on_local_node; - hcoll_rte_functions.rte_world_group_fn = get_world_group_handle; - hcoll_rte_functions.rte_jobid_fn = jobid; - hcoll_rte_functions.rte_progress_fn = progress; - 
hcoll_rte_functions.rte_get_coll_handle_fn = get_coll_handle; - hcoll_rte_functions.rte_coll_handle_test_fn = coll_handle_test; - hcoll_rte_functions.rte_coll_handle_free_fn = coll_handle_free; - hcoll_rte_functions.rte_coll_handle_complete_fn = coll_handle_complete; - hcoll_rte_functions.rte_group_id_fn = group_id; - hcoll_rte_functions.rte_world_rank_fn = world_rank; -#if HCOLL_API >= HCOLL_VERSION(3,6) - hcoll_rte_functions.rte_get_mpi_type_envelope_fn = get_mpi_type_envelope; - hcoll_rte_functions.rte_get_mpi_type_contents_fn = get_mpi_type_contents; - hcoll_rte_functions.rte_get_hcoll_type_fn = get_hcoll_type; - hcoll_rte_functions.rte_set_hcoll_type_fn = set_hcoll_type; - hcoll_rte_functions.rte_get_mpi_constants_fn = get_mpi_constants; -#endif -} - - -void hcoll_rte_fns_setup(void) -{ - init_module_fns(); - OBJ_CONSTRUCT(&mca_coll_hcoll_component.requests, opal_free_list_t); - opal_free_list_init(&(mca_coll_hcoll_component.requests), - sizeof(ompi_coll_base_nbc_request_t), - opal_cache_line_size, OBJ_CLASS(ompi_coll_base_nbc_request_t), - /* no payload data */ - 0, 0, 10, -1, 10, - /* No Mpool or init function */ - NULL, 0, NULL, NULL, NULL); -} - -static int recv_nb(struct dte_data_representation_t data, - size_t count, - void *buffer, - rte_ec_handle_t ec_h, - rte_grp_handle_t grp_h, - uint32_t tag, - rte_request_handle_t *req) -{ - ompi_communicator_t *comm = (ompi_communicator_t *)grp_h; - - if (NULL == ec_h.handle && -1 != ec_h.rank) { - fprintf(stderr,"***Error in hcolrte_rml_recv_nb: wrong null argument: " - "ec_h.handle = %p, ec_h.rank = %d\n",ec_h.handle,ec_h.rank); - return HCOLL_ERROR; - } - assert(HCOL_DTE_IS_INLINE(data)); - /*do inline nb recv*/ - size_t size; - ompi_request_t *ompi_req; - - if (!buffer && !HCOL_DTE_IS_ZERO(data)) { - fprintf(stderr, "***Error in hcolrte_rml_recv_nb: buffer pointer is NULL" - " for non DTE_ZERO INLINE data representation\n"); - return HCOLL_ERROR; - } - size = (size_t)data.rep.in_line_rep.data_handle.in_line.packed_size*count/8; - - HCOL_VERBOSE(30,"PML_IRECV: dest = %d: buf = %p: size = %zu: comm = %p", - ec_h.rank, buffer, size, (void *)comm); - if (MCA_PML_CALL(irecv(buffer,size,&(ompi_mpi_unsigned_char.dt),ec_h.rank, - tag,comm,&ompi_req))) - { - return HCOLL_ERROR; - } - req->data = (void *)ompi_req; - req->status = HCOLRTE_REQUEST_ACTIVE; - - return HCOLL_SUCCESS; -} - - -static int send_nb( dte_data_representation_t data, - size_t count, - void *buffer, - rte_ec_handle_t ec_h, - rte_grp_handle_t grp_h, - uint32_t tag, - rte_request_handle_t *req) -{ - ompi_communicator_t *comm = (ompi_communicator_t *)grp_h; - - if (! 
ec_h.handle) { - fprintf(stderr,"***Error in hcolrte_rml_send_nb: wrong null argument: " - "ec_h.handle = %p, ec_h.rank = %d\n",ec_h.handle,ec_h.rank); - return HCOLL_ERROR; - } - assert(HCOL_DTE_IS_INLINE(data)); - /*do inline nb recv*/ - size_t size; - ompi_request_t *ompi_req; - if (!buffer && !HCOL_DTE_IS_ZERO(data)) { - fprintf(stderr, "***Error in hcolrte_rml_send_nb: buffer pointer is NULL" - " for non DTE_ZERO INLINE data representation\n"); - return HCOLL_ERROR; - } - size = (size_t)data.rep.in_line_rep.data_handle.in_line.packed_size*count/8; - HCOL_VERBOSE(30,"PML_ISEND: dest = %d: buf = %p: size = %zu: comm = %p", - ec_h.rank, buffer, size, (void *)comm); - if (MCA_PML_CALL(isend(buffer,size,&(ompi_mpi_unsigned_char.dt),ec_h.rank, - tag,MCA_PML_BASE_SEND_STANDARD,comm,&ompi_req))) - { - return HCOLL_ERROR; - } - req->data = (void *)ompi_req; - req->status = HCOLRTE_REQUEST_ACTIVE; - return HCOLL_SUCCESS; -} - -static int test( rte_request_handle_t * request , - int * completed ) -{ - ompi_request_t * ompi_req = (ompi_request_t *)request->data; - if (HCOLRTE_REQUEST_ACTIVE != request->status){ - *completed = true; - return HCOLL_SUCCESS; - } - - /*ompi_request_test(&ompi_req,completed,MPI_STATUS_IGNORE); */ - *completed = REQUEST_COMPLETE(ompi_req); - if (*completed){ - ompi_request_free(&ompi_req); - request->status = HCOLRTE_REQUEST_DONE; - } - - return HCOLL_SUCCESS; -} - -static int ec_handle_compare( rte_ec_handle_t handle_1 , - rte_grp_handle_t - group_handle_1 , - rte_ec_handle_t handle_2 , - rte_grp_handle_t - group_handle_2 ) -{ - return handle_1.handle == handle_2.handle; -} - -static int get_ec_handles( int num_ec , - int * ec_indexes , - rte_grp_handle_t grp_h, - rte_ec_handle_t * ec_handles ) -{ - int i; - ompi_communicator_t *comm = (ompi_communicator_t *)grp_h; - for (i=0; i<num_ec; i++){ - ompi_proc_t *proc = ompi_comm_peer_lookup(comm,ec_indexes[i]); - ec_handles[i].rank = ec_indexes[i]; - ec_handles[i].handle = (void *)proc; - } - return HCOLL_SUCCESS; -} - -#if 0 /* This callback is not used */ -static int get_my_ec(rte_grp_handle_t grp_h, rte_ec_handle_t *ec_handle) -{ - ompi_communicator_t *comm = (ompi_communicator_t *)grp_h; - int my_rank = ompi_comm_rank(comm); - ompi_proc_t *my_proc = ompi_comm_peer_lookup(comm,my_rank); - ec_handle->handle = (void *)my_proc; - ec_handle->rank = my_rank; - return HCOLL_SUCCESS; -} -#endif - -static int group_size ( rte_grp_handle_t grp_h ) -{ - return ompi_comm_size((ompi_communicator_t *)grp_h); -} - -static int my_rank (rte_grp_handle_t grp_h ) -{ - return ompi_comm_rank((ompi_communicator_t *)grp_h); -} - -static int ec_on_local_node (rte_ec_handle_t ec, rte_grp_handle_t group){ - ompi_proc_t *proc = (ompi_proc_t *)ec.handle; - return OPAL_PROC_ON_LOCAL_NODE(proc->super.proc_flags); -} - - -static rte_grp_handle_t get_world_group_handle(void) -{ - return (rte_grp_handle_t)&ompi_mpi_comm_world.comm; -} - -static uint32_t jobid(void){ - return OMPI_PROC_MY_NAME->jobid; -} - -static int group_id(rte_grp_handle_t group){ - return ((ompi_communicator_t *)group)->c_index; -} - -static int -request_free(struct ompi_request_t **ompi_req) -{ - ompi_request_t *req = *ompi_req; - if (!coll_handle_test(req)) { - return OMPI_ERROR; - } - coll_handle_free(req); - *ompi_req = MPI_REQUEST_NULL; - return OMPI_SUCCESS; -} - -static void* get_coll_handle(void) -{ - ompi_coll_base_nbc_request_t *ompi_req; - opal_free_list_item_t *item; - item = opal_free_list_wait (&(mca_coll_hcoll_component.requests)); - if (OPAL_UNLIKELY(NULL == item)) { - HCOL_ERROR("Wait for free list failed.\n"); - return NULL; - } - ompi_req = (ompi_coll_base_nbc_request_t *)item; - OMPI_REQUEST_INIT(&ompi_req->super,false); - ompi_req->super.req_complete_cb = NULL; - ompi_req->super.req_complete_cb_data = NULL; - ompi_req->super.req_status.MPI_ERROR = MPI_SUCCESS; - ompi_req->super.req_state = OMPI_REQUEST_ACTIVE; - ompi_req->super.req_free = request_free; - ompi_req->super.req_type = OMPI_REQUEST_COLL; - 
ompi_req->data.refcounted.objs.objs[0] = NULL; - ompi_req->data.refcounted.objs.objs[1] = NULL; - return (void *)ompi_req; -} - -static int coll_handle_test(void* handle) -{ - ompi_request_t *ompi_req = (ompi_request_t *)handle; - return REQUEST_COMPLETE(ompi_req);; -} - -static void coll_handle_free(void *handle){ - ompi_request_t *ompi_req = (ompi_request_t *)handle; - opal_free_list_return (&mca_coll_hcoll_component.requests, - (opal_free_list_item_t *)ompi_req); -} - -static void coll_handle_complete(void *handle) -{ - ompi_request_t *ompi_req = (ompi_request_t *)handle; - ompi_request_complete(ompi_req,true); -} - - -static int world_rank(rte_grp_handle_t grp_h, rte_ec_handle_t ec){ - ompi_proc_t *proc = (ompi_proc_t *)ec.handle; - return ((ompi_process_name_t*)&proc->super.proc_name)->vpid; -} - -#if HCOLL_API >= HCOLL_VERSION(3,6) -hcoll_mpi_type_combiner_t ompi_combiner_2_hcoll_combiner(int ompi_combiner) { - switch (ompi_combiner) - { - case MPI_COMBINER_CONTIGUOUS: - return HCOLL_MPI_COMBINER_CONTIGUOUS; - case MPI_COMBINER_VECTOR: - return HCOLL_MPI_COMBINER_VECTOR; - case MPI_COMBINER_HVECTOR: - return HCOLL_MPI_COMBINER_HVECTOR; - case MPI_COMBINER_INDEXED: - return HCOLL_MPI_COMBINER_INDEXED; - case MPI_COMBINER_HINDEXED_INTEGER: - case MPI_COMBINER_HINDEXED: - return HCOLL_MPI_COMBINER_HINDEXED; - case MPI_COMBINER_DUP: - return HCOLL_MPI_COMBINER_DUP; - case MPI_COMBINER_INDEXED_BLOCK: - return HCOLL_MPI_COMBINER_INDEXED_BLOCK; - case MPI_COMBINER_HINDEXED_BLOCK: - return HCOLL_MPI_COMBINER_HINDEXED_BLOCK; - case MPI_COMBINER_SUBARRAY: - return HCOLL_MPI_COMBINER_SUBARRAY; - case MPI_COMBINER_DARRAY: - return HCOLL_MPI_COMBINER_DARRAY; - case MPI_COMBINER_F90_REAL: - return HCOLL_MPI_COMBINER_F90_REAL; - case MPI_COMBINER_F90_COMPLEX: - return HCOLL_MPI_COMBINER_F90_COMPLEX; - case MPI_COMBINER_F90_INTEGER: - return HCOLL_MPI_COMBINER_F90_INTEGER; - case MPI_COMBINER_RESIZED: - return HCOLL_MPI_COMBINER_RESIZED; - case MPI_COMBINER_STRUCT: - case MPI_COMBINER_STRUCT_INTEGER: - return HCOLL_MPI_COMBINER_STRUCT; - default: - break; - } - return HCOLL_MPI_COMBINER_LAST; -} - - -static int get_mpi_type_envelope(void *mpi_type, int *num_integers, - int *num_addresses, int *num_datatypes, - hcoll_mpi_type_combiner_t *combiner) { - int ompi_combiner, rc; - rc = ompi_datatype_get_args( (ompi_datatype_t*)mpi_type, 0, num_integers, NULL, - num_addresses, NULL, - num_datatypes, NULL, &ompi_combiner); - *combiner = ompi_combiner_2_hcoll_combiner(ompi_combiner); - return rc == OMPI_SUCCESS ? HCOLL_SUCCESS : HCOLL_ERROR; -} - -static int get_mpi_type_contents(void *mpi_type, int max_integers, int max_addresses, - int max_datatypes, int *array_of_integers, - void *array_of_addresses, void *array_of_datatypes) { - int rc; - rc = ompi_datatype_get_args( (ompi_datatype_t*)mpi_type, 1, &max_integers, array_of_integers, - &max_addresses, array_of_addresses, - &max_datatypes, array_of_datatypes, NULL ); - return rc == OMPI_SUCCESS ? HCOLL_SUCCESS : HCOLL_ERROR; -} - -static int get_hcoll_type(void *mpi_type, dte_data_representation_t *hcoll_type) { - *hcoll_type = ompi_dtype_2_hcoll_dtype((ompi_datatype_t*)mpi_type, TRY_FIND_DERIVED); - return HCOL_DTE_IS_ZERO((*hcoll_type)) ? 
HCOLL_ERR_NOT_FOUND : HCOLL_SUCCESS; -} - -static int set_hcoll_type(void *mpi_type, dte_data_representation_t hcoll_type) { - int rc; - mca_coll_hcoll_dtype_t *hcoll_dtype = (mca_coll_hcoll_dtype_t*) - opal_free_list_get(&mca_coll_hcoll_component.dtypes); - ompi_datatype_t *dtype = (ompi_datatype_t*)mpi_type; - hcoll_dtype->type = hcoll_type; - rc = ompi_attr_set_c(TYPE_ATTR, (void*)dtype, &(dtype->d_keyhash), hcoll_type_attr_keyval, (void *)hcoll_dtype, false); - if (OMPI_SUCCESS != rc) { - HCOL_VERBOSE(1,"hcoll ompi_attr_set_c failed for derived dtype"); - goto Cleanup; - } - return HCOLL_SUCCESS; -Cleanup: - opal_free_list_return(&mca_coll_hcoll_component.dtypes, - &hcoll_dtype->super); - return rc; -} - -static int get_mpi_constants(size_t *mpi_datatype_size, - int *mpi_order_c, int *mpi_order_fortran, - int *mpi_distribute_block, - int *mpi_distribute_cyclic, - int *mpi_distribute_none, - int *mpi_distribute_dflt_darg) { - *mpi_datatype_size = sizeof(MPI_Datatype); - *mpi_order_c = MPI_ORDER_C; - *mpi_order_fortran = MPI_ORDER_FORTRAN; - *mpi_distribute_block = MPI_DISTRIBUTE_BLOCK; - *mpi_distribute_cyclic = MPI_DISTRIBUTE_CYCLIC; - *mpi_distribute_none = MPI_DISTRIBUTE_NONE; - *mpi_distribute_dflt_darg = MPI_DISTRIBUTE_DFLT_DARG; - return HCOLL_SUCCESS; -} - -#endif diff --git a/ompi/mca/coll/hcoll/configure.m4 b/ompi/mca/coll/hcoll/configure.m4 deleted file mode 100644 index 3d2c2b3a581..00000000000 --- a/ompi/mca/coll/hcoll/configure.m4 +++ /dev/null @@ -1,38 +0,0 @@ -# -*- shell-script -*- -# -# -# Copyright (c) 2011 Mellanox Technologies. All rights reserved. -# Copyright (c) 2015 Research Organization for Information Science -# and Technology (RIST). All rights reserved. -# Copyright (c) 2022 Amazon.com, Inc. or its affiliates. All Rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - - -# MCA_coll_hcoll_CONFIG([action-if-can-compile], -# [action-if-cant-compile]) -# ------------------------------------------------ -AC_DEFUN([MCA_ompi_coll_hcoll_CONFIG],[ - AC_CONFIG_FILES([ompi/mca/coll/hcoll/Makefile]) - - OMPI_CHECK_HCOLL([coll_hcoll], - [coll_hcoll_happy="yes"], - [coll_hcoll_happy="no"]) - - AS_IF([test "$coll_hcoll_happy" = "yes"], - [coll_hcoll_WRAPPER_EXTRA_LDFLAGS="$coll_hcoll_LDFLAGS" - coll_hcoll_CPPFLAGS="$coll_hcoll_CPPFLAGS" - coll_hcoll_WRAPPER_EXTRA_LIBS="$coll_hcoll_LIBS" - $1], - [$2]) - - # substitute in the things needed to build hcoll - AC_SUBST([coll_hcoll_CPPFLAGS]) - AC_SUBST([coll_hcoll_LDFLAGS]) - AC_SUBST([coll_hcoll_LIBS]) -])dnl - diff --git a/ompi/mca/coll/hcoll/owner.txt b/ompi/mca/coll/hcoll/owner.txt deleted file mode 100644 index 8dacea65a6d..00000000000 --- a/ompi/mca/coll/hcoll/owner.txt +++ /dev/null @@ -1,7 +0,0 @@ -# -# owner/status file -# owner: institution that is responsible for this package -# status: e.g. active, maintenance, unmaintained -# -owner: MELLANOX -status: active diff --git a/ompi/op/op.c b/ompi/op/op.c index c800dc0a1cb..11064ec6f59 100644 --- a/ompi/op/op.c +++ b/ompi/op/op.c @@ -284,10 +284,6 @@ int ompi_op_init(void) FLAGS, "MPI_NO_OP")) { return OMPI_ERROR; }else{ -/* This code is placed back here to support - * HCOL allreduce at the moment. It is a part of bgate repository only. This conflict with OMPI v1.7 - * is to be resolved some other way. 
- * */ ompi_mpi_op_null.op.op_type = OMPI_OP_NULL; ompi_mpi_op_max.op.op_type = OMPI_OP_MAX; ompi_mpi_op_min.op.op_type = OMPI_OP_MIN; diff --git a/oshmem/mca/scoll/basic/scoll_basic.h b/oshmem/mca/scoll/basic/scoll_basic.h index 73365c62b4e..77760bbae93 100644 --- a/oshmem/mca/scoll/basic/scoll_basic.h +++ b/oshmem/mca/scoll/basic/scoll_basic.h @@ -23,7 +23,7 @@ BEGIN_C_DECLS * In case of shmem, the implementation of broadcast doesn't require * each process to know message size ( just root should know). * It differs from other implementations, so it may cause problems if - * BCAST_FUNC is a callback to another implementation (e.g, fca, hcoll). + * BCAST_FUNC is a callback to another implementation (e.g, fca). * So we replace a callback (group->g_scoll.scoll_[func]) * with a corresponding basic function. */ From 42200272b8d533634e81688e42029c61728e539e Mon Sep 17 00:00:00 2001 From: Howard Pritchard Date: Tue, 28 Oct 2025 14:06:17 -0600 Subject: [PATCH 08/51] fix empty status fields related to #13478 but without the controversial stuff. Signed-off-by: Howard Pritchard --- ompi/request/request.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/ompi/request/request.c b/ompi/request/request.c index 3f72a23fde0..e816325cf5f 100644 --- a/ompi/request/request.c +++ b/ompi/request/request.c @@ -18,7 +18,7 @@ * Copyright (c) 2015-2024 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2018 FUJITSU LIMITED. All rights reserved. - * Copyright (c) 2018 Triad National Security, LLC. All rights + * Copyright (c) 2018-2025 Triad National Security, LLC. All rights * reserved. * Copyright (c) 2022 IBM Corporation. All rights reserved. * Copyright (c) 2024 NVIDIA Corporation. All rights reserved. @@ -47,7 +47,14 @@ ompi_request_t ompi_request_empty = {{{{0}}}}; #if MPI_VERSION >= 4 ompi_request_t ompi_request_empty_send = {{{{0}}}}; #endif -ompi_status_public_t ompi_status_empty = {0}; +/* + * See section 3.7.3 of the MPI 1.3 (probably older as well) MPI standard + */ +ompi_status_public_t ompi_status_empty = {.MPI_TAG = MPI_ANY_TAG, + .MPI_SOURCE = MPI_ANY_SOURCE, + .MPI_ERROR = MPI_SUCCESS, + ._cancelled = 0, + ._ucount = 0UL}; ompi_request_fns_t ompi_request_functions = { ompi_request_default_test, ompi_request_default_test_any, From e3ad19f1ea695a4c01c525b3941948554486e9bd Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Wed, 29 Oct 2025 09:25:59 -0400 Subject: [PATCH 09/51] Revert "tuned: use tree instead of bruck at scale" This reverts commit 9bd775769b2d64286a50c768d3a543312de4aaa5. 
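For reference, the selection ladder restored by this revert reads as follows; the ID-to-name mapping (4 = bruck, 6 = tree) follows from the message of the commit being reverted, and the comments are annotations rather than part of the source:

    } else if (communicator_size < 256) {
        alg = 4;   /* bruck */
    } else if (communicator_size < 512) {
        alg = 6;   /* tree */
    } else if (communicator_size < 1024) {
        alg = 4;   /* bruck */
    } else if (communicator_size < 4096) {
        alg = 6;   /* tree */
    } else {
        alg = 4;   /* bruck again at 4096 ranks and beyond */
    }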
--- ompi/mca/coll/tuned/coll_tuned_decision_fixed.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c b/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c index d210ff4412f..fa31aef1860 100644 --- a/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c +++ b/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c @@ -490,8 +490,14 @@ int ompi_coll_tuned_barrier_intra_dec_fixed(struct ompi_communicator_t *comm, alg = 3; } else if (communicator_size < 256) { alg = 4; - } else { + } else if (communicator_size < 512) { + alg = 6; + } else if (communicator_size < 1024) { + alg = 4; + } else if (communicator_size < 4096) { alg = 6; + } else { + alg = 4; } return ompi_coll_tuned_barrier_intra_do_this (comm, module, From b840b3a68d22934b5bd53ad44ba4bd33f8044b10 Mon Sep 17 00:00:00 2001 From: Jessie Yang Date: Thu, 30 Oct 2025 10:52:13 -0700 Subject: [PATCH 10/51] ompi-prte: advance to sha 5ad79eb Signed-off-by: Jessie Yang --- 3rd-party/prrte | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rd-party/prrte b/3rd-party/prrte index 8d9c03932b2..5ad79eb2850 160000 --- a/3rd-party/prrte +++ b/3rd-party/prrte @@ -1 +1 @@ -Subproject commit 8d9c03932b23d749b9ed969479cd9be03f9240ef +Subproject commit 5ad79eb285023d1dcca472ccba9de5987b51cc27 From 4673f0793d3efe6cfb4ffac3df481aa5d28b12e7 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Sat, 25 Oct 2025 11:35:38 -0400 Subject: [PATCH 11/51] Replace sprintf with snprintf Mac OS clang warns that sprintf is deprecated. Replace it with snprintf. Signed-off-by: Joseph Schuchart --- ompi/communicator/comm_cid.c | 6 +-- ompi/debuggers/ompi_common_dll_defs.h | 2 +- ompi/instance/instance.c | 4 +- .../ftagree/coll_ftagree_earlyreturning.c | 4 +- ompi/mca/coll/ucc/coll_ucc_module.c | 6 +-- .../monitoring/common_monitoring_coll.c | 5 ++- .../hook/comm_method/hook_comm_method_fns.c | 42 ++++++++++--------- .../sharedfp/lockedfile/sharedfp_lockedfile.c | 2 +- opal/mca/btl/smcuda/btl_smcuda.c | 2 +- opal/mca/btl/smcuda/btl_smcuda_component.c | 5 ++- opal/mca/btl/tcp/btl_tcp_component.c | 8 ++-- opal/util/timings.h | 31 +++++++------- test/datatype/position.c | 4 +- test/simple/crisscross.c | 2 +- test/simple/no-disconnect.c | 2 +- test/simple/parallel_r64.c | 2 +- test/simple/parallel_r8.c | 2 +- test/simple/parallel_w64.c | 2 +- test/simple/parallel_w8.c | 2 +- test/support/support.h | 18 ++++---- 20 files changed, 78 insertions(+), 73 deletions(-) diff --git a/ompi/communicator/comm_cid.c b/ompi/communicator/comm_cid.c index 8546b700401..22967d3dfb4 100644 --- a/ompi/communicator/comm_cid.c +++ b/ompi/communicator/comm_cid.c @@ -417,7 +417,7 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu char msg_string[1024]; switch (rc) { case PMIX_ERR_UNREACH: - sprintf(msg_string,"PMIx server unreachable"); + snprintf(msg_string, sizeof(msg_string), "PMIx server unreachable"); opal_show_help("help-comm.txt", "MPI function not supported", true, @@ -427,7 +427,7 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu rc = MPI_ERR_UNSUPPORTED_OPERATION; break; case PMIX_ERR_NOT_SUPPORTED: - sprintf(msg_string,"PMIx server does not support PMIx Group operations"); + snprintf(msg_string, sizeof(msg_string), "PMIx server does not support PMIx Group operations"); opal_show_help("help-comm.txt", "MPI function not supported", true, @@ -577,7 +577,7 @@ int ompi_comm_nextcid_nb (ompi_communicator_t *newcomm, ompi_communicator_t *com functions but the pml 
does not support these functions so return not supported */ if (NULL == comm) { char msg_string[1024]; - sprintf(msg_string,"The PML being used - %s - does not support MPI sessions related features", + snprintf(msg_string, sizeof(msg_string), "The PML being used - %s - does not support MPI sessions related features", mca_pml_base_selected_component.pmlm_version.mca_component_name); opal_show_help("help-comm.txt", "MPI function not supported", diff --git a/ompi/debuggers/ompi_common_dll_defs.h b/ompi/debuggers/ompi_common_dll_defs.h index 5fe11d3986e..5e8fa9f814e 100644 --- a/ompi/debuggers/ompi_common_dll_defs.h +++ b/ompi/debuggers/ompi_common_dll_defs.h @@ -247,7 +247,7 @@ typedef struct int MPI_TAG; int MPI_ERROR; int _cancelled; - size_t _ucount; + int _ucount; } offset; } ompi_status_public_t; /* datatype structure */ diff --git a/ompi/instance/instance.c b/ompi/instance/instance.c index 2596c1decf5..103358cb527 100644 --- a/ompi/instance/instance.c +++ b/ompi/instance/instance.c @@ -1292,7 +1292,7 @@ static int ompi_instance_group_pmix_pset (ompi_instance_t *instance, const char ret = MPI_ERR_ARG; /* pset_name not valid */ break; case PMIX_ERR_UNREACH: - sprintf(msg_string,"PMIx server unreachable"); + snprintf(msg_string, sizeof(msg_string), "PMIx server unreachable"); opal_show_help("help-comm.txt", "MPI function not supported", true, @@ -1301,7 +1301,7 @@ static int ompi_instance_group_pmix_pset (ompi_instance_t *instance, const char ret = MPI_ERR_UNSUPPORTED_OPERATION; break; case PMIX_ERR_NOT_SUPPORTED: - sprintf(msg_string,"PMIx server does not support PMIX_QUERY_PSET_MEMBERSHIP operation"); + snprintf(msg_string, sizeof(msg_string), "PMIx server does not support PMIX_QUERY_PSET_MEMBERSHIP operation"); opal_show_help("help-comm.txt", "MPI function not supported", true, diff --git a/ompi/mca/coll/ftagree/coll_ftagree_earlyreturning.c b/ompi/mca/coll/ftagree/coll_ftagree_earlyreturning.c index 6f954166006..9450c443349 100644 --- a/ompi/mca/coll/ftagree/coll_ftagree_earlyreturning.c +++ b/ompi/mca/coll/ftagree/coll_ftagree_earlyreturning.c @@ -497,7 +497,7 @@ static void era_debug_print_group(int lvl, ompi_group_t *group, ompi_communicato } s = 128 + n * 16; str = (char*)malloc(s); - sprintf(str, "Group of size %d. Ranks in %d.%d: (", n, comm->c_index, comm->c_epoch); + snprintf(str, s, "Group of size %d. Ranks in %d.%d: (", n, comm->c_index, comm->c_epoch); p = strlen(str); for(i = 0; i < n; i++) { snprintf(str + p, s - p, "%d%s", gra[i], i==n-1 ? 
"" : ", "); @@ -2285,7 +2285,7 @@ static void send_msg(ompi_communicator_t *comm, b++; } while(w < 256); if( strlen(strbytes) >= 252 ) { - sprintf(strbytes + 252, "..."); + snprintf(strbytes + 252, 256 - 252, "..."); } OPAL_OUTPUT_VERBOSE((30, ompi_ftmpi_output_handle, diff --git a/ompi/mca/coll/ucc/coll_ucc_module.c b/ompi/mca/coll/ucc/coll_ucc_module.c index 00b3f31ad73..dfa3c1cf96c 100644 --- a/ompi/mca/coll/ucc/coll_ucc_module.c +++ b/ompi/mca/coll/ucc/coll_ucc_module.c @@ -312,14 +312,14 @@ static int mca_coll_ucc_init_ctx(ompi_communicator_t* comm) goto cleanup_lib; } - sprintf(str_buf, "%u", ompi_proc_world_size()); + snprintf(str_buf, sizeof(str_buf), "%u", ompi_proc_world_size()); if (UCC_OK != ucc_context_config_modify(ctx_config, NULL, "ESTIMATED_NUM_EPS", str_buf)) { UCC_ERROR("UCC context config modify failed for estimated_num_eps"); goto cleanup_lib; } - sprintf(str_buf, "%u", opal_process_info.num_local_peers + 1); + snprintf(str_buf, sizeof(str_buf), "%u", opal_process_info.num_local_peers + 1); if (UCC_OK != ucc_context_config_modify(ctx_config, NULL, "ESTIMATED_NUM_PPN", str_buf)) { UCC_ERROR("UCC context config modify failed for estimated_num_eps"); @@ -327,7 +327,7 @@ static int mca_coll_ucc_init_ctx(ompi_communicator_t* comm) } if (ucc_api_major > 1 || (ucc_api_major == 1 && ucc_api_minor >= 6)) { - sprintf(str_buf, "%u", opal_process_info.my_local_rank); + snprintf(str_buf, sizeof(str_buf), "%u", opal_process_info.my_local_rank); if (UCC_OK != ucc_context_config_modify(ctx_config, NULL, "NODE_LOCAL_ID", str_buf)) { UCC_ERROR("UCC context config modify failed for node_local_id"); diff --git a/ompi/mca/common/monitoring/common_monitoring_coll.c b/ompi/mca/common/monitoring/common_monitoring_coll.c index a7fa7bdb0df..b1255502f49 100644 --- a/ompi/mca/common/monitoring/common_monitoring_coll.c +++ b/ompi/mca/common/monitoring/common_monitoring_coll.c @@ -70,7 +70,8 @@ static inline void mca_common_monitoring_coll_cache(mca_monitoring_coll_data_t*d assert( 0 < size ); /* Allocate enough space for list (add 1 to keep the final '\0' if already exact size) */ max_length = snprintf(NULL, 0, "%d,", world_size - 1) + 1; - tmp_procs = malloc((1 + max_length * size) * sizeof(char)); + int bufsize = (1 + max_length * size) * sizeof(char); + tmp_procs = malloc(bufsize); if( NULL == tmp_procs ) { OPAL_MONITORING_PRINT_ERR("Cannot allocate memory for caching proc list."); } else { @@ -78,7 +79,7 @@ static inline void mca_common_monitoring_coll_cache(mca_monitoring_coll_data_t*d /* Build procs list */ for(i = 0; i < size; ++i) { if( OPAL_SUCCESS == mca_common_monitoring_get_world_rank(i, data->p_comm->c_remote_group, &world_rank) ) - pos += sprintf(&tmp_procs[pos], "%d,", world_rank); + pos += snprintf(&tmp_procs[pos], bufsize - pos, "%d,", world_rank); } tmp_procs[pos - 1] = '\0'; /* Remove final coma */ data->procs = realloc(tmp_procs, pos * sizeof(char)); /* Adjust to size required */ diff --git a/ompi/mca/hook/comm_method/hook_comm_method_fns.c b/ompi/mca/hook/comm_method/hook_comm_method_fns.c index b1ab8c200b3..5570a292ef4 100644 --- a/ompi/mca/hook/comm_method/hook_comm_method_fns.c +++ b/ompi/mca/hook/comm_method/hook_comm_method_fns.c @@ -325,9 +325,9 @@ abbreviate_list_into_string(char *str, int max, int *list, int nlist) strcpy(&str[strlen(str)], ", "); } if (lo != hi) { - sprintf(&str[strlen(str)], "%d - %d", lo, hi); + snprintf(&str[strlen(str)], max - strlen(str), "%d - %d", lo, hi); } else { - sprintf(&str[strlen(str)], "%d", lo); + snprintf(&str[strlen(str)], max - 
strlen(str), "%d", lo); } } /* @@ -352,9 +352,9 @@ abbreviate_list_into_string(char *str, int max, int *list, int nlist) strcpy(&str[strlen(str)], ", "); } if (lo != hi) { - sprintf(&str[strlen(str)], "%d - %d", lo, hi); + snprintf(&str[strlen(str)], max - strlen(str), "%d - %d", lo, hi); } else { - sprintf(&str[strlen(str)], "%d", lo); + snprintf(&str[strlen(str)], max - strlen(str), "%d", lo); } } } @@ -460,7 +460,7 @@ ompi_report_comm_methods(int called_from_location) len = strlen(opal_process_info.nodename) + 100; hoststring = malloc(len + 1); - sprintf(hoststring, "Host %d [%s] ranks ", + snprintf(hoststring, len + 1, "Host %d [%s] ranks ", myleaderrank, opal_process_info.nodename); abbreviate_list_into_string(&hoststring[strlen(hoststring)], @@ -548,7 +548,7 @@ ompi_report_comm_methods(int called_from_location) ompi_count_array_t lens_desc; ompi_disp_array_t disps_desc; - // First get the array of host strings (host names and task lists) + // First get the array of host strings (host names and task lists) // for all nodes. len = strlen(hoststring) + 1; if (myleaderrank == 0) { @@ -642,7 +642,7 @@ ompi_report_comm_methods(int called_from_location) // 2: 2d table if (nleaderranks <= max2Dprottable) { char *str, *p; - int tmp, per, has_ucx_transport; + int tmp, per, has_ucx_transport, bufsize; int strlens[NUM_COMM_METHODS]; // characters per entry in the 2d table, must be large enough @@ -668,11 +668,11 @@ ompi_report_comm_methods(int called_from_location) if (tmp+1 > per) { per = tmp+1; } } } - - str = malloc(nleaderranks * per + 1); + bufsize = nleaderranks * per + 1; + str = malloc(bufsize); p = str; for (i=0; i 0) { // if (!first) { // strcat(str, " /"); // } - sprintf(&str[strlen(str)], + snprintf(&str[strlen(str)], + 1024 - strlen(str), " [%dx %s]", method_count[k], comm_method_to_string(k)); diff --git a/ompi/mca/sharedfp/lockedfile/sharedfp_lockedfile.c b/ompi/mca/sharedfp/lockedfile/sharedfp_lockedfile.c index cbeedd12c34..bfbc940ae11 100644 --- a/ompi/mca/sharedfp/lockedfile/sharedfp_lockedfile.c +++ b/ompi/mca/sharedfp/lockedfile/sharedfp_lockedfile.c @@ -117,7 +117,7 @@ struct mca_sharedfp_base_module_2_0_0_t * mca_sharedfp_lockedfile_component_file /* Set the filename. 
*/ /*data filename created by appending .locktest.$rank to the original filename*/ - sprintf(filename,"%s%s%d",fh->f_filename,".locktest.",rank); + snprintf(filename, sizeof(filename), "%s%s%d",fh->f_filename,".locktest.",rank); lock.l_type = F_WRLCK; lock.l_start = 0; diff --git a/opal/mca/btl/smcuda/btl_smcuda.c b/opal/mca/btl/smcuda/btl_smcuda.c index c4389d422f2..1ce2b966ece 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.c +++ b/opal/mca/btl/smcuda/btl_smcuda.c @@ -486,7 +486,7 @@ static struct mca_btl_base_endpoint_t *create_sm_endpoint(int local_proc, struct OBJ_CONSTRUCT(&ep->pending_sends, opal_list_t); OBJ_CONSTRUCT(&ep->endpoint_lock, opal_mutex_t); #if OPAL_ENABLE_PROGRESS_THREADS == 1 - sprintf(path, "%s" OPAL_PATH_SEP "sm_fifo.%lu", opal_process_info.job_session_dir, + snprintf(path, sizeof(path), "%s" OPAL_PATH_SEP "sm_fifo.%lu", opal_process_info.job_session_dir, (unsigned long) proc->proc_name); ep->fifo_fd = open(path, O_WRONLY); if (ep->fifo_fd < 0) { diff --git a/opal/mca/btl/smcuda/btl_smcuda_component.c b/opal/mca/btl/smcuda/btl_smcuda_component.c index 72b75d67311..8f633d6b48a 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_component.c +++ b/opal/mca/btl/smcuda/btl_smcuda_component.c @@ -859,8 +859,9 @@ mca_btl_smcuda_component_init(int *num_btls, bool enable_progress_threads, bool #if OPAL_ENABLE_PROGRESS_THREADS == 1 /* create a named pipe to receive events */ - sprintf(mca_btl_smcuda_component.sm_fifo_path, "%s" OPAL_PATH_SEP "sm_fifo.%lu", - opal_process_info.job_session_dir, (unsigned long) OPAL_PROC_MY_NAME->vpid); + snprintf(mca_btl_smcuda_component.sm_fifo_path, sizeof(mca_btl_smcuda_component.sm_fifo_path), + "%s" OPAL_PATH_SEP "sm_fifo.%lu", + opal_process_info.job_session_dir, (unsigned long) OPAL_PROC_MY_NAME->vpid); if (mkfifo(mca_btl_smcuda_component.sm_fifo_path, 0660) < 0) { opal_output(0, "mca_btl_smcuda_component_init: mkfifo failed with errno=%d\n", errno); return NULL; diff --git a/opal/mca/btl/tcp/btl_tcp_component.c b/opal/mca/btl/tcp/btl_tcp_component.c index 39715729c96..41d8a5c162b 100644 --- a/opal/mca/btl/tcp/btl_tcp_component.c +++ b/opal/mca/btl/tcp/btl_tcp_component.c @@ -562,12 +562,12 @@ static int mca_btl_tcp_create(const int if_kindex, const char *if_name) btl->tcp_ifmask = selected_interface->if_mask; /* allow user to specify interface bandwidth */ - sprintf(param, "bandwidth_%s", if_name); + snprintf(param, sizeof(param), "bandwidth_%s", if_name); mca_btl_tcp_param_register_uint(param, NULL, btl->super.btl_bandwidth, OPAL_INFO_LVL_5, &btl->super.btl_bandwidth); /* allow user to override/specify latency ranking */ - sprintf(param, "latency_%s", if_name); + snprintf(param, sizeof(param), "latency_%s", if_name); mca_btl_tcp_param_register_uint(param, NULL, btl->super.btl_latency, OPAL_INFO_LVL_5, &btl->super.btl_latency); if (i > 0) { @@ -576,12 +576,12 @@ static int mca_btl_tcp_create(const int if_kindex, const char *if_name) } /* allow user to specify interface bandwidth */ - sprintf(param, "bandwidth_%s:%d", if_name, i); + snprintf(param, sizeof(param), "bandwidth_%s:%d", if_name, i); mca_btl_tcp_param_register_uint(param, NULL, btl->super.btl_bandwidth, OPAL_INFO_LVL_5, &btl->super.btl_bandwidth); /* allow user to override/specify latency ranking */ - sprintf(param, "latency_%s:%d", if_name, i); + snprintf(param, sizeof(param), "latency_%s:%d", if_name, i); mca_btl_tcp_param_register_uint(param, NULL, btl->super.btl_latency, OPAL_INFO_LVL_5, &btl->super.btl_latency); diff --git a/opal/util/timings.h b/opal/util/timings.h index 
6dfc2ef6440..76aa7373124 100644 --- a/opal/util/timings.h +++ b/opal/util/timings.h @@ -60,7 +60,8 @@ void opal_timing_disable_native_timers(void); if (n > OPAL_TIMING_STR_LEN) { \ (_nm)->error = 1; \ } \ - n = sprintf((_nm)->cntr_env, "OMPI_TIMING_%s_CNT", (_nm)->id); \ + n = snprintf((_nm)->cntr_env, OPAL_TIMING_STR_LEN, \ + "OMPI_TIMING_%s_CNT", (_nm)->id); \ if (n > OPAL_TIMING_STR_LEN) { \ (_nm)->error = 1; \ } \ @@ -135,7 +136,7 @@ void opal_timing_disable_native_timers(void); } \ setenv(buf1, buf2, 1); \ h->cntr++; \ - sprintf(buf1, "%d", h->cntr); \ + snprintf(buf1, OPAL_TIMING_STR_LEN, "%d", h->cntr); \ setenv(h->cntr_env, buf1, 1); \ /* We don't include env operations into the consideration. \ * Hopefully this will help to make measurements more accurate. \ @@ -187,19 +188,19 @@ void opal_timing_disable_native_timers(void); } \ } while (0) -# define OPAL_TIMING_ENV_GETDESC_PREFIX(prefix, filename, func, i, desc, _t) \ - do { \ - char vname[OPAL_TIMING_STR_LEN]; \ - (_t) = 0.0; \ - sprintf(vname, "OMPI_TIMING_%s_%s_FILE_%d", prefix, func, i); \ - *filename = getenv(vname); \ - sprintf(vname, "OMPI_TIMING_%s_%s_DESC_%d", prefix, func, i); \ - *desc = getenv(vname); \ - sprintf(vname, "OMPI_TIMING_%s_%s_VAL_%d", prefix, func, i); \ - char *ptr = getenv(vname); \ - if (NULL != ptr) { \ - sscanf(ptr, "%lf", &(_t)); \ - } \ +# define OPAL_TIMING_ENV_GETDESC_PREFIX(prefix, filename, func, i, desc, _t) \ + do { \ + char vname[OPAL_TIMING_STR_LEN]; \ + (_t) = 0.0; \ + snprintf(vname, OPAL_TIMING_STR_LEN, "OMPI_TIMING_%s_%s_FILE_%d", prefix, func, i); \ + *filename = getenv(vname); \ + snprintf(vname, OPAL_TIMING_STR_LEN, "OMPI_TIMING_%s_%s_DESC_%d", prefix, func, i); \ + *desc = getenv(vname); \ + snprintf(vname, OPAL_TIMING_STR_LEN, "OMPI_TIMING_%s_%s_VAL_%d", prefix, func, i); \ + char *ptr = getenv(vname); \ + if (NULL != ptr) { \ + sscanf(ptr, "%lf", &(_t)); \ + } \ } while (0) # define OPAL_TIMING_ENV_GETDESC(file, func, index, desc) \ diff --git a/test/datatype/position.c b/test/datatype/position.c index 94809e07a10..bd4f2834833 100644 --- a/test/datatype/position.c +++ b/test/datatype/position.c @@ -201,9 +201,9 @@ static char *bytes_dump(void *src, size_t cnt) static char text[1024]; int index, i; - index = sprintf(text, "0x"); + index = snprintf(text, sizeof(text), "0x"); for (i = 0; i < (int) cnt; i++) - index += sprintf(text + index, "%x", (int) (((char *) src)[i])); + index += snprintf(text + index, sizeof(text) - index, "%x", (int) (((char *) src)[i])); *(text + index) = '\0'; return text; } diff --git a/test/simple/crisscross.c b/test/simple/crisscross.c index 0f2f544ebb0..36378d49963 100644 --- a/test/simple/crisscross.c +++ b/test/simple/crisscross.c @@ -92,7 +92,7 @@ int main(int argc, char *argv[]) mpierr = MPI_Get_processor_name(process_name, &count); if (mpierr != MPI_SUCCESS) { fprintf(stderr, "MPI Error %d (MPI_Get_processor_name) [%d]\n", mpierr, rank); - sprintf(process_name, "%s", rr_empty); + snprintf(process_name, sizeof(process_name), "%s", rr_empty); } else { if (count < MAX_RR_NAME) strncat(&process_name[count], rr_blank, MAX_RR_NAME - count); diff --git a/test/simple/no-disconnect.c b/test/simple/no-disconnect.c index d493a1bd946..6c3c6796a7a 100644 --- a/test/simple/no-disconnect.c +++ b/test/simple/no-disconnect.c @@ -162,7 +162,7 @@ int main(int argc, char **argv) printf("level = %d\n", level); /* prepare send buffer */ - sprintf(bufs, "level %d (pid:%d)", level, getpid()); + snprintf(bufs, sizeof(bufs), "level %d (pid:%d)", level, getpid()); /* spawn */ 
if (level < max_depth) { diff --git a/test/simple/parallel_r64.c b/test/simple/parallel_r64.c index 7ca5219417f..31460dd924f 100644 --- a/test/simple/parallel_r64.c +++ b/test/simple/parallel_r64.c @@ -78,7 +78,7 @@ int main(int argc, char *argv[]) } } if (MPI_Get_processor_name(process_name, &count) != MPI_SUCCESS) { - sprintf(process_name, "%s", rr_empty); + snprintf(process_name, sizeof(process_name), "%s", rr_empty); } else { if (count < MAX_RR_NAME) strncat(&process_name[count], rr_blank, MAX_RR_NAME - count); diff --git a/test/simple/parallel_r8.c b/test/simple/parallel_r8.c index b8541e17ed5..239ba26a574 100644 --- a/test/simple/parallel_r8.c +++ b/test/simple/parallel_r8.c @@ -78,7 +78,7 @@ int main(int argc, char *argv[]) } } if (MPI_Get_processor_name(process_name, &count) != MPI_SUCCESS) { - sprintf(process_name, "%s", rr_empty); + snprintf(process_name, sizeof(process_name), "%s", rr_empty); } else { if (count < MAX_RR_NAME) strncat(&process_name[count], rr_blank, MAX_RR_NAME - count); diff --git a/test/simple/parallel_w64.c b/test/simple/parallel_w64.c index fdb9acdd29c..90210ca9ff7 100644 --- a/test/simple/parallel_w64.c +++ b/test/simple/parallel_w64.c @@ -78,7 +78,7 @@ int main(int argc, char *argv[]) } } if (MPI_Get_processor_name(process_name, &count) != MPI_SUCCESS) { - sprintf(process_name, "%s", rr_empty); + snprintf(process_name, sizeof(process_name), "%s", rr_empty); } else { if (count < MAX_RR_NAME) strncat(&process_name[count], rr_blank, MAX_RR_NAME - count); diff --git a/test/simple/parallel_w8.c b/test/simple/parallel_w8.c index 87eb41f6968..1b933c75b68 100644 --- a/test/simple/parallel_w8.c +++ b/test/simple/parallel_w8.c @@ -77,7 +77,7 @@ int main(int argc, char *argv[]) } } if (MPI_Get_processor_name(process_name, &count) != MPI_SUCCESS) { - sprintf(process_name, "%s", rr_empty); + snprintf(process_name, sizeof(process_name), "%s", rr_empty); } else { if (count < MAX_RR_NAME) strncat(&process_name[count], rr_blank, MAX_RR_NAME - count); diff --git a/test/support/support.h b/test/support/support.h index a7249ff535b..8c2062ea978 100644 --- a/test/support/support.h +++ b/test/support/support.h @@ -47,15 +47,15 @@ void test_fail_stop(const char *msg, int status); * test_verify: Non-fatal assertion macro. 
*/ -#define test_verify(MESSAGE, EXPR) \ - do { \ - if (!(EXPR)) { \ - char s[256]; \ - sprintf(s, "%s:%d: %s: %s\n", __FILE__, __LINE__, MESSAGE, #EXPR); \ - test_failure(s); \ - } else { \ - test_success(); \ - } \ +#define test_verify(MESSAGE, EXPR) \ + do { \ + if (!(EXPR)) { \ + char s[256]; \ + snprintf(s, sizeof(s), "%s:%d: %s: %s\n", __FILE__, __LINE__, MESSAGE, #EXPR); \ + test_failure(s); \ + } else { \ + test_success(); \ + } \ } while (0) #endif /* OMPI_SUPPORT_H */ From 2b56685bb7e1b060cad2c5744679fb3c8623ccc2 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Sat, 25 Oct 2025 11:36:32 -0400 Subject: [PATCH 12/51] acoll: remove unused module variables Signed-off-by: Joseph Schuchart --- ompi/mca/coll/acoll/coll_acoll_barrier.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/ompi/mca/coll/acoll/coll_acoll_barrier.c b/ompi/mca/coll/acoll/coll_acoll_barrier.c index 2398053191b..02f3a6500cb 100644 --- a/ompi/mca/coll/acoll/coll_acoll_barrier.c +++ b/ompi/mca/coll/acoll/coll_acoll_barrier.c @@ -119,7 +119,6 @@ int mca_coll_acoll_barrier_shm_h(struct ompi_communicator_t *comm, mca_coll_base int root = 0; int rank = ompi_comm_rank(comm); int size = ompi_comm_size(comm); - mca_coll_acoll_module_t *acoll_module = (mca_coll_acoll_module_t *) module; coll_acoll_init(module, comm, subc->data, subc, root); coll_acoll_data_t *data = subc->data; @@ -227,7 +226,6 @@ int mca_coll_acoll_barrier_shm_f(struct ompi_communicator_t *comm, mca_coll_base int root = 0; int rank = ompi_comm_rank(comm); int size = ompi_comm_size(comm); - mca_coll_acoll_module_t *acoll_module = (mca_coll_acoll_module_t *) module; coll_acoll_init(module, comm, subc->data, subc, root); coll_acoll_data_t *data = subc->data; From 4bfe98eab2b5da1cd2b3ee18b8f1d3a69aaa9694 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Sat, 25 Oct 2025 11:37:20 -0400 Subject: [PATCH 13/51] comm_cid: initialize remote_cid64 Clang warns about possible uninitialized use. 
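A minimal sketch of the pattern clang flags; the assignments are simplified illustrations, not the actual comm_cid.c logic:

    size_t remote_cid64;                   /* no initializer */
    if (PMIX_SUCCESS == rc) {
        remote_cid64 = pmix_value;         /* assigned only on the success path */
    }
    *remote_cid = (uint32_t)remote_cid64;  /* error paths may read an indeterminate value */

Initializing the variable to 0 at its declaration gives every path a well-defined value.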
Signed-off-by: Joseph Schuchart --- ompi/communicator/comm_cid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ompi/communicator/comm_cid.c b/ompi/communicator/comm_cid.c index 22967d3dfb4..5029e43da73 100644 --- a/ompi/communicator/comm_cid.c +++ b/ompi/communicator/comm_cid.c @@ -1066,7 +1066,7 @@ int ompi_comm_get_remote_cid_from_pmix (ompi_communicator_t *comm, int dest, uin pmix_value_t *val = NULL; ompi_comm_extended_cid_t excid; int rc = OMPI_SUCCESS; - size_t remote_cid64; + size_t remote_cid64 = 0; assert(NULL != remote_cid); From ef93ff4bd8ae686d7842e4fb8a86dc7c0d2ba943 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Sat, 25 Oct 2025 11:37:41 -0400 Subject: [PATCH 14/51] comm_cid: fix printf warnings Signed-off-by: Joseph Schuchart --- ompi/communicator/comm_cid.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ompi/communicator/comm_cid.c b/ompi/communicator/comm_cid.c index 5029e43da73..be99de913ab 100644 --- a/ompi/communicator/comm_cid.c +++ b/ompi/communicator/comm_cid.c @@ -1082,7 +1082,7 @@ int ompi_comm_get_remote_cid_from_pmix (ompi_communicator_t *comm, int dest, uin PMIX_INFO_LOAD(&tinfo[1], PMIX_GROUP_CONTEXT_ID, &excid.cid_base, PMIX_SIZE); PMIX_INFO_SET_QUALIFIER(&tinfo[1]); if (PMIX_SUCCESS != (rc = PMIx_Get(&pmix_proc, PMIX_GROUP_LOCAL_CID, tinfo, 2, &val))) { - OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Get failed for PMIX_GROUP_LOCAL_CID cid_base %ld %s", excid.cid_base, PMIx_Error_string(rc))); + OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Get failed for PMIX_GROUP_LOCAL_CID cid_base %"PRIu64" %s", excid.cid_base, PMIx_Error_string(rc))); rc = OMPI_ERR_NOT_FOUND; goto done; } @@ -1103,7 +1103,7 @@ int ompi_comm_get_remote_cid_from_pmix (ompi_communicator_t *comm, int dest, uin rc = OMPI_SUCCESS; *remote_cid = (uint32_t)remote_cid64; comm->c_index_vec[dest] = (uint32_t)remote_cid64; - OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Get PMIX_GROUP_LOCAL_CID %d for cid_base %ld", *remote_cid, excid.cid_base)); + OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Get PMIX_GROUP_LOCAL_CID %d for cid_base %"PRIu64, *remote_cid, excid.cid_base)); done: if (NULL != val) { From 588ae908fbfcb50531f20fecc17802451db82d25 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Sat, 25 Oct 2025 19:10:12 -0400 Subject: [PATCH 15/51] reduce-local: add parens around min/max macros The expression `max_k < min(a, b)` may not produce what we think it does. Signed-off-by: Joseph Schuchart --- test/datatype/reduce_local.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/datatype/reduce_local.c b/test/datatype/reduce_local.c index 9f1c06d4ba0..42fdb3e05c3 100644 --- a/test/datatype/reduce_local.c +++ b/test/datatype/reduce_local.c @@ -59,9 +59,9 @@ static int do_ops[12] = { static int verbose = 0; static int total_errors = 0; -#define max(a, b) (a) > (b) ? (a) : (b) +#define max(a, b) ((a) > (b) ? (a) : (b)) -#define min(a, b) (a) < (b) ? (a) : (b) +#define min(a, b) ((a) < (b) ? (a) : (b)) static void print_status(char *op, char *type, int type_size, int count, int max_shift, double *duration, int repeats, int correct) From 4a41be842fbaaa9cec82192f3fec41cbb37d6a5a Mon Sep 17 00:00:00 2001 From: Howard Pritchard Date: Mon, 10 Nov 2025 10:54:50 -0700 Subject: [PATCH 16/51] group_from_session_pset - remove confusing error message The implementation of MPI_Group_from_session_pset was emitting a confusing error message when querying the PMIx server for members of a process group. 
The routine was using a method for creating an error message that was intended to be used during MPI initialization. Remove that statement and let an error code be returned. Related to #13497 Signed-off-by: Howard Pritchard --- ompi/instance/instance.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/ompi/instance/instance.c b/ompi/instance/instance.c index 103358cb527..8ca19a9724c 100644 --- a/ompi/instance/instance.c +++ b/ompi/instance/instance.c @@ -1313,7 +1313,6 @@ static int ompi_instance_group_pmix_pset (ompi_instance_t *instance, const char ret = opal_pmix_convert_status(rc); break; } - ompi_instance_print_error ("PMIx_Query_info() failed", ret); goto fn_w_query; } @@ -1334,7 +1333,6 @@ static int ompi_instance_group_pmix_pset (ompi_instance_t *instance, const char if (OPAL_SUCCESS == rc) { group->grp_proc_pointers[i] = ompi_proc_find_and_add(&pname,&isnew); } else { - ompi_instance_print_error ("OPAL_PMIX_CONVERT_PROCT failed %d", ret); ompi_group_free(&group); goto fn_w_info; } From 091d788ff675d980f69714428a06b3fe5a0dd49d Mon Sep 17 00:00:00 2001 From: Edgar Gabriel Date: Tue, 11 Nov 2025 20:46:04 +0000 Subject: [PATCH 17/51] adjust hdf5 workflow script hdf5-2.0 was released, which was breaking our GitHub workflow for hdf5. However, the new 2.0 release of hdf5 changed the build system to CMake instead of configure/make. We will investigate at a later stage how to adjust the CI workflow script for hdf5-2.0; for now, just hard-code the last release of hdf5-1.14.x. Signed-off-by: Edgar Gabriel --- .github/workflows/hdf5-tests.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/hdf5-tests.yaml b/.github/workflows/hdf5-tests.yaml index 42417033a88..82f8dbdcdb6 100644 --- a/.github/workflows/hdf5-tests.yaml +++ b/.github/workflows/hdf5-tests.yaml @@ -20,9 +20,9 @@ jobs: make -j 8 && make install - name: Install HDF5 run: | - wget --progress=dot:giga https://github.com/HDFGroup/hdf5/releases/latest/download/hdf5.tar.gz - tar -xzf hdf5.tar.gz - mv hdf5-1* hdf5 + wget --progress=dot:giga https://github.com/HDFGroup/hdf5/releases/download/hdf5_1.14.6/hdf5-1.14.6.tar.gz + tar -xzf hdf5-1.14.6.tar.gz + mv hdf5-1.14.6 hdf5 cd hdf5 export PATH=/opt/openmpi/bin:${PATH} export LD_LIBRARY_PATH=/opt/openmpi/lib:${LD_LIBRARY_PATH} From 3571f8ca0b9a6f8757927f3ba71581cd306edad0 Mon Sep 17 00:00:00 2001 From: Matthew Whitlock Date: Mon, 10 Nov 2025 14:18:53 -0600 Subject: [PATCH 18/51] Always populate failed procs in comms' groups Signed-off-by: Matthew Whitlock --- ompi/communicator/ft/comm_ft.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/ompi/communicator/ft/comm_ft.c b/ompi/communicator/ft/comm_ft.c index bd2627c778c..429d574bbed 100644 --- a/ompi/communicator/ft/comm_ft.c +++ b/ompi/communicator/ft/comm_ft.c @@ -792,13 +792,12 @@ bool ompi_comm_is_proc_active(ompi_communicator_t *comm, int peer_id, bool remot int ompi_comm_set_rank_failed(ompi_communicator_t *comm, int peer_id, bool remote) { -#if OPAL_ENABLE_DEBUG /* populate the proc in the comm's group array so that it is not a sentinel and can be read as failed */ - ompi_proc_t *ompi_proc = ompi_group_get_proc_ptr((remote ? comm->c_remote_group : comm->c_local_group), - peer_id, true); + ompi_proc_t *ompi_proc __opal_attribute_unused__; + ompi_proc = ompi_group_get_proc_ptr((remote ? 
comm->c_remote_group : comm->c_local_group), + peer_id, true); assert(NULL != ompi_proc); -#endif /* Disable ANY_SOURCE */ comm->any_source_enabled = false; From aac16475a0d8de1e7c24e229db955a867418afe1 Mon Sep 17 00:00:00 2001 From: Nathan Hjelm Date: Wed, 12 Nov 2025 09:48:21 -0700 Subject: [PATCH 19/51] btl: fix ompi_info for btl_flags The flag enumerator used for btl_flags was missing an entry for the MCA_BTL_FLAGS_RDMA_REMOTE_COMPLETION flag. If this flag was set on a btl, it would cause ompi_info to fail when attempting to get the value for btl_flags, leading to the flag just being omitted from the output (another bug, since it should fail and print an error). Signed-off-by: Nathan Hjelm --- opal/mca/btl/base/btl_base_frame.c | 1 + opal/mca/btl/btl.h | 3 +++ 2 files changed, 4 insertions(+) diff --git a/opal/mca/btl/base/btl_base_frame.c b/opal/mca/btl/base/btl_base_frame.c index 4341d78f6ec..63fc0f7f75c 100644 --- a/opal/mca/btl/base/btl_base_frame.c +++ b/opal/mca/btl/base/btl_base_frame.c @@ -58,6 +58,7 @@ mca_base_var_enum_value_flag_t mca_btl_base_flag_enum_flags[] {MCA_BTL_FLAGS_PUT_AM, "put-am", MCA_BTL_FLAGS_PUT}, {MCA_BTL_FLAGS_GET_AM, "get_am", MCA_BTL_FLAGS_GET}, {MCA_BTL_FLAGS_ATOMIC_AM_FOP, "atomic-am", MCA_BTL_FLAGS_ATOMIC_FOPS}, + {MCA_BTL_FLAGS_RDMA_REMOTE_COMPLETION, "rdma-remote-completion", 0}, {0, NULL, 0}}; mca_base_var_enum_value_flag_t mca_btl_base_atomic_enum_flags[] diff --git a/opal/mca/btl/btl.h b/opal/mca/btl/btl.h index c66ca1ad909..f6f97564cd5 100644 --- a/opal/mca/btl/btl.h +++ b/opal/mca/btl/btl.h @@ -273,6 +273,9 @@ typedef uint8_t mca_btl_base_tag_t; */ #define MCA_BTL_FLAGS_RDMA_REMOTE_COMPLETION 0x800000 +/* End of btl flags. If additional flags are added please update + * mca_btl_base_flag_enum_flags in btl_base_frame.c */ + /* Default exclusivity levels */ #define MCA_BTL_EXCLUSIVITY_HIGH (64 * 1024) /* internal loopback */ #define MCA_BTL_EXCLUSIVITY_DEFAULT 1024 /* GM/IB/etc.
*/ From 756401574f9600513f7fcefea2cff5825bbd2671 Mon Sep 17 00:00:00 2001 From: Van Man NGUYEN Date: Thu, 13 Nov 2025 10:19:10 +0100 Subject: [PATCH 20/51] ubcl: removed unused cuda-related m4 macros Signed-off-by: Van Man NGUYEN --- ompi/mca/pml/ubcl/configure.m4 | 2 -- 1 file changed, 2 deletions(-) diff --git a/ompi/mca/pml/ubcl/configure.m4 b/ompi/mca/pml/ubcl/configure.m4 index c3159651a41..262de492c19 100644 --- a/ompi/mca/pml/ubcl/configure.m4 +++ b/ompi/mca/pml/ubcl/configure.m4 @@ -21,8 +21,6 @@ AC_DEFUN([MCA_ompi_pml_ubcl_CONFIG], [ AC_REQUIRE([MCA_ompi_common_ubcl_CONFIG]) AC_REQUIRE([MCA_opal_common_ubcl_CONFIG]) - AC_REQUIRE([OPAL_CHECK_CUDA]) - AC_REQUIRE([OPAL_CHECK_CUDART]) AS_IF([test "$pml_ubcl_happy" = "yes"], [$1], From 8cb3ba6aa072424dda3e9973713ec3316dabc5c9 Mon Sep 17 00:00:00 2001 From: Howard Pritchard Date: Wed, 19 Nov 2025 19:21:44 -0700 Subject: [PATCH 21/51] fortran: fix a problem with request_get_status_some it doesn't compile with integer(kind=8) as the default integer without this patch Signed-off-by: Howard Pritchard --- ompi/mpi/fortran/mpif-h/request_get_status_some_f.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ompi/mpi/fortran/mpif-h/request_get_status_some_f.c b/ompi/mpi/fortran/mpif-h/request_get_status_some_f.c index 72182798a9c..6cae95847e5 100644 --- a/ompi/mpi/fortran/mpif-h/request_get_status_some_f.c +++ b/ompi/mpi/fortran/mpif-h/request_get_status_some_f.c @@ -116,7 +116,7 @@ void ompi_request_get_status_some_f(MPI_Fint *incount, MPI_Fint *array_of_reques if (NULL != ierr) *ierr = OMPI_INT_2_FINT(c_ierr); if (MPI_SUCCESS == c_ierr) { - OMPI_SINGLE_INT_2_FINT(*outcount); + OMPI_SINGLE_INT_2_FINT(outcount); if (MPI_UNDEFINED != OMPI_FINT_2_INT(*outcount)) { OMPI_ARRAY_INT_2_FINT(array_of_indices, *outcount); From 112b4f813b82e354980c2c65474c350eb186a98a Mon Sep 17 00:00:00 2001 From: Nathan Hjelm Date: Thu, 27 Feb 2025 13:59:40 -0700 Subject: [PATCH 22/51] btl/uct: complete re-work of the BTL This commit is large and contains the following changes: - Disconnect the connection memory domain from the communication domain. This allows any memory domain to be used for connections. The default is to use tcp, but it can be disabled, which will allow UD and others to be used. - Move tl attributes off of the tl context structure. In theory tl attributes do not differ between contexts, so query them once when the tl is created, not once per context. This removes the need to allocate the first context, so that code has also been removed. - Change the mca_btl_uct_tl_t uct_dev_contexts member to be an array. The btl always allocates the maximum number of contexts. This is not a significant amount of memory. Rather than reduce it to be based on the configured maximum number of contexts, it makes sense to just make it an array and remove the extra indirection when accessing the contexts. - Do not call mca_btl_uct_endpoint_set_flag before sending a message on the connection endpoint. This method may cause the release of the connection endpoint (cached on the BTL endpoint). If this happens, it would lead to a SEGV. - Flush the endpoint only when it is being released. There is no need to do so on every send. Releasing the endpoint without flushing it may lead to it being destroyed while still processing data. - Downgrade the endpoint lock from recursive. Recursive locks are not needed for the endpoint lock. - Move the async context from the module to the tl. There is no real benefit from sharing the async context between tls.
Given this and some other changes that will be made, it makes sense to move it from the module to the tl. - Connection TLs are only used to form connections for connect-to-endpoint TLs. They do not need to belong to the same memory domain as the one they are used with, so there is no need to rely on a BTL module. Moved the pending_connection_reqs to the tl and changed the code to support a NULL module for the connection tl. - Put active tls in a list on the mca_btl_uct_md_t structure. This simplifies the code a bit by moving mca_btl_uct_tl_t ownership to the mca_btl_uct_md_t class. - There is an issue with btl/uct which prevents the usage of the standard btl_uct_ MCA variables (eager limit, flags, etc.). Because of the way the btl was written these values are all determined directly from UCT and cannot be changed using the MCA variable interface. To address this issue, this commit breaks apart the initialization code and separates out the pieces that are necessary for discovery only. The discovery pieces now use a new set of variables that include the memory domain name and directly control the behavior for BTLs on that memory domain, as well as enabling the usage of the btl_uct variable to control the defaults for these variables. For example, using memory domain irdma0 will create the variables: btl_uct_irdma0_eager_limit, btl_uct_irdma0_max_send_size, etc. The defaults will be based on what is reported by UCT, and the user can set the values to a subset of what UCT reports. For example, if the max send size for the hardware is 8192B then it can be set to anything up to and including that value. The same is true for feature flags: if the hardware supports only some btl atomics or operations, the user can specify a subset of them (others will be ignored). See the usage sketch below. - Move device context code to a new file. There is a specific header for device contexts, so it makes sense to move the context-specific code to a matching C file. No changes here other than moving code around. - Use uct_ep_am_short_iov for short messages. The uct_ep_am_short_iov method should allow for faster short messages than uct_ep_am_short (which can only take a single buffer). This commit moves btl/uct to the newer method, which breaks compatibility with some versions of UCT. Since we no longer support those versions, this change is safe.
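As a usage sketch (the command and values are hypothetical and assume a memory domain named irdma0 is present; btl_uct_irdma0_max_send_size and btl_uct_irdma0_eager_limit are the per-domain variables described above): `mpirun --mca btl self,uct --mca btl_uct_irdma0_max_send_size 4096 --mca btl_uct_irdma0_eager_limit 2048 ./app`. Per the rule above, such values can only be set up to and including what UCT reports for the hardware, and feature flags can only be narrowed, not extended.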
Signed-off-by: Nathan Hjelm --- opal/mca/btl/uct/Makefile.am | 30 +- opal/mca/btl/uct/btl_uct.h | 64 ++- opal/mca/btl/uct/btl_uct_am.c | 69 ++- opal/mca/btl/uct/btl_uct_component.c | 494 +++++--------------- opal/mca/btl/uct/btl_uct_device_context.c | 154 +++++++ opal/mca/btl/uct/btl_uct_device_context.h | 13 +- opal/mca/btl/uct/btl_uct_discover.c | 522 ++++++++++++++++++++++ opal/mca/btl/uct/btl_uct_discover.h | 43 ++ opal/mca/btl/uct/btl_uct_endpoint.c | 322 ++++++------- opal/mca/btl/uct/btl_uct_frag.c | 22 +- opal/mca/btl/uct/btl_uct_include_list.c | 78 ++++ opal/mca/btl/uct/btl_uct_include_list.h | 34 ++ opal/mca/btl/uct/btl_uct_modex.c | 198 ++++++++ opal/mca/btl/uct/btl_uct_modex.h | 20 + opal/mca/btl/uct/btl_uct_module.c | 82 +++- opal/mca/btl/uct/btl_uct_rdma.c | 4 +- opal/mca/btl/uct/btl_uct_rdma.h | 4 +- opal/mca/btl/uct/btl_uct_tl.c | 379 ++++------------ opal/mca/btl/uct/btl_uct_types.h | 93 +++- 19 files changed, 1669 insertions(+), 956 deletions(-) create mode 100644 opal/mca/btl/uct/btl_uct_device_context.c create mode 100644 opal/mca/btl/uct/btl_uct_discover.c create mode 100644 opal/mca/btl/uct/btl_uct_discover.h create mode 100644 opal/mca/btl/uct/btl_uct_include_list.c create mode 100644 opal/mca/btl/uct/btl_uct_include_list.h create mode 100644 opal/mca/btl/uct/btl_uct_modex.c create mode 100644 opal/mca/btl/uct/btl_uct_modex.h diff --git a/opal/mca/btl/uct/Makefile.am b/opal/mca/btl/uct/Makefile.am index df548cc66ff..11799cfe3fe 100644 --- a/opal/mca/btl/uct/Makefile.am +++ b/opal/mca/btl/uct/Makefile.am @@ -13,6 +13,7 @@ # Copyright (c) 2017 IBM Corporation. All rights reserved. # Copyright (c) 2017-2018 Los Alamos National Security, LLC. All rights # reserved. +# Copyright (c) 2025 Google, LLC. All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -24,22 +25,31 @@ AM_CPPFLAGS = $(btl_uct_CPPFLAGS) amca_paramdir = $(AMCA_PARAM_SETS_DIR) -sources = \ +headers = \ btl_uct.h \ + btl_uct_rdma.h \ + btl_uct_endpoint.h \ + btl_uct_am.h \ + btl_uct_frag.h \ + btl_uct_types.h \ + btl_uct_device_context.h \ + btl_uct_discover.h \ + btl_uct_modex.h \ + btl_uct_include_list.h + +sources = \ btl_uct_module.c \ btl_uct_component.c \ - btl_uct_rdma.h \ btl_uct_rdma.c \ - btl_uct_endpoint.h \ btl_uct_endpoint.c \ btl_uct_amo.c \ - btl_uct_am.h \ btl_uct_am.c \ - btl_uct_frag.h \ btl_uct_frag.c \ btl_uct_tl.c \ - btl_uct_types.h \ - btl_uct_device_context.h + btl_uct_discover.c \ + btl_uct_modex.c \ + btl_uct_include_list.c \ + btl_uct_device_context.c # Make the output library in this directory, and name it either # mca__.la (for DSO builds) or libmca__.la @@ -50,20 +60,22 @@ lib = lib_sources = component = mca_btl_uct.la component_sources = $(sources) +component_headers = $(headers) else lib = libmca_btl_uct.la lib_sources = $(sources) +lib_headers = ${headers} component = component_sources = endif mcacomponentdir = $(opallibdir) mcacomponent_LTLIBRARIES = $(component) -mca_btl_uct_la_SOURCES = $(component_sources) +mca_btl_uct_la_SOURCES = $(component_sources) $(component_headers) mca_btl_uct_la_LDFLAGS = -module -avoid-version $(btl_uct_LDFLAGS) mca_btl_uct_la_LIBADD = $(top_builddir)/opal/lib@OPAL_LIB_NAME@.la $(btl_uct_LIBS) noinst_LTLIBRARIES = $(lib) -libmca_btl_uct_la_SOURCES = $(lib_sources) +libmca_btl_uct_la_SOURCES = $(lib_sources) $(lib_headers) libmca_btl_uct_la_LDFLAGS = -module -avoid-version $(btl_uct_LDFLAGS) libmca_btl_uct_la_LIBADD = $(btl_uct_LIBS) diff --git a/opal/mca/btl/uct/btl_uct.h b/opal/mca/btl/uct/btl_uct.h index 
65bc69fddb2..20b40783d46 100644 --- a/opal/mca/btl/uct/btl_uct.h +++ b/opal/mca/btl/uct/btl_uct.h @@ -64,6 +64,9 @@ struct mca_btl_uct_module_t { /** base BTL interface */ mca_btl_base_module_t super; + /** module index in the component module array */ + int module_index; + /** whether the module has been fully initialized or not */ bool initialized; @@ -76,31 +79,15 @@ struct mca_btl_uct_module_t { /** mutex to protect the module */ opal_recursive_mutex_t lock; - /** async context */ - ucs_async_context_t *ucs_async; - /** transport for active messaging */ mca_btl_uct_tl_t *am_tl; /** transport for RDMA/AMOs */ mca_btl_uct_tl_t *rdma_tl; - /** transport for forming connections (if needed) */ - mca_btl_uct_tl_t *conn_tl; - - /** array containing the am_tl and rdma_tl */ - mca_btl_uct_tl_t *comm_tls[2]; - -#if UCT_API >= UCT_VERSION(1, 7) - uct_component_h uct_component; -#endif - /** registration cache */ mca_rcache_base_module_t *rcache; - /** name of the memory domain backing this module */ - char *md_name; - /** am and rdma share endpoints */ bool shared_endpoints; @@ -119,8 +106,9 @@ struct mca_btl_uct_module_t { /** frags that were waiting on connections that are now ready to send */ opal_list_t pending_frags; - /** pending connection requests */ - opal_fifo_t pending_connection_reqs; + /** allowed transports */ + char *allowed_transports; + mca_btl_uct_include_list_t allowed_transport_list; }; typedef struct mca_btl_uct_module_t mca_btl_uct_module_t; @@ -133,6 +121,9 @@ struct mca_btl_uct_component_t { /** base BTL component */ mca_btl_base_component_3_0_0_t super; + /** whether the component is initialized. controls cleanup. */ + bool initialized; + /** number of TL modules */ int module_count; @@ -141,10 +132,15 @@ struct mca_btl_uct_component_t { /** allowed UCT memory domains */ char *memory_domains; + mca_btl_uct_include_list_t memory_domain_list; /** allowed transports */ char *allowed_transports; + /** transports to consider for forming connections */ + char *connection_domains; + mca_btl_uct_include_list_t connection_domain_list; + /** number of worker contexts to create */ int num_contexts_per_module; @@ -158,6 +154,17 @@ struct mca_btl_uct_component_t { /** connection retry timeout */ unsigned int connection_retry_timeout; + +#if UCT_API >= UCT_VERSION(1, 7) + uct_component_h *uct_components; + unsigned num_uct_components; +#endif + + /** list of memory domains (btl_uct_md_t) */ + opal_list_t md_list; + + /** connection transport (if needed). 
reference is owned by conn_md */ + mca_btl_uct_tl_t *conn_tl; }; typedef struct mca_btl_uct_component_t mca_btl_uct_component_t; @@ -293,11 +300,16 @@ ucs_status_t mca_btl_uct_am_handler(void *arg, void *data, size_t length, unsign struct mca_btl_base_endpoint_t *mca_btl_uct_get_ep(struct mca_btl_base_module_t *module, opal_proc_t *proc); -int mca_btl_uct_query_tls(mca_btl_uct_module_t *module, mca_btl_uct_md_t *md, - uct_tl_resource_desc_t *tl_descs, unsigned tl_count); +int mca_btl_uct_populate_tls(mca_btl_uct_md_t *md, uct_tl_resource_desc_t *tl_descs, unsigned tl_count); int mca_btl_uct_process_connection_request(mca_btl_uct_module_t *module, mca_btl_uct_conn_req_t *req); +mca_btl_uct_module_t *mca_btl_uct_alloc_module(mca_btl_uct_md_t *md, + size_t registration_size); + +int mca_btl_uct_evaluate_tl(mca_btl_uct_module_t *module, mca_btl_uct_tl_t *tl); +int mca_btl_uct_enable_tl_conn(mca_btl_uct_tl_t *tl); + /** * @brief Checks if a tl is suitable for using for RDMA * @@ -305,7 +317,7 @@ int mca_btl_uct_process_connection_request(mca_btl_uct_module_t *module, */ static inline bool mca_btl_uct_tl_supports_rdma(mca_btl_uct_tl_t *tl) { - return (MCA_BTL_UCT_TL_ATTR(tl, 0).cap.flags + return (tl->uct_iface_attr.cap.flags & (UCT_IFACE_FLAG_PUT_ZCOPY | UCT_IFACE_FLAG_GET_ZCOPY)) == (UCT_IFACE_FLAG_PUT_ZCOPY | UCT_IFACE_FLAG_GET_ZCOPY); } @@ -315,7 +327,7 @@ static inline bool mca_btl_uct_tl_supports_rdma(mca_btl_uct_tl_t *tl) */ static inline bool mca_btl_uct_tl_support_am(mca_btl_uct_tl_t *tl) { - return (MCA_BTL_UCT_TL_ATTR(tl, 0).cap.flags + return (tl->uct_iface_attr.cap.flags & (UCT_IFACE_FLAG_AM_SHORT | UCT_IFACE_FLAG_AM_BCOPY | UCT_IFACE_FLAG_AM_ZCOPY)); } @@ -326,7 +338,7 @@ static inline bool mca_btl_uct_tl_support_am(mca_btl_uct_tl_t *tl) */ static inline bool mca_btl_uct_tl_supports_conn(mca_btl_uct_tl_t *tl) { - return (MCA_BTL_UCT_TL_ATTR(tl, 0).cap.flags + return (tl->uct_iface_attr.cap.flags & (UCT_IFACE_FLAG_AM_SHORT | UCT_IFACE_FLAG_CONNECT_TO_IFACE)) == (UCT_IFACE_FLAG_AM_SHORT | UCT_IFACE_FLAG_CONNECT_TO_IFACE); } @@ -338,7 +350,11 @@ static inline bool mca_btl_uct_tl_supports_conn(mca_btl_uct_tl_t *tl) */ static inline bool mca_btl_uct_tl_requires_connection_tl(mca_btl_uct_tl_t *tl) { - return !(MCA_BTL_UCT_TL_ATTR(tl, 0).cap.flags & UCT_IFACE_FLAG_CONNECT_TO_IFACE); + if (NULL == tl) { + return false; + } + + return !(tl->uct_iface_attr.cap.flags & UCT_IFACE_FLAG_CONNECT_TO_IFACE); } END_C_DECLS diff --git a/opal/mca/btl/uct/btl_uct_am.c b/opal/mca/btl/uct/btl_uct_am.c index 1aae456842c..913f90d7949 100644 --- a/opal/mca/btl/uct/btl_uct_am.c +++ b/opal/mca/btl/uct/btl_uct_am.c @@ -2,6 +2,7 @@ /* * Copyright (c) 2018 Los Alamos National Security, LLC. All rights * reserved. + * Copyright (c) 2025 Google, LLC. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -26,7 +27,7 @@ mca_btl_base_descriptor_t *mca_btl_uct_alloc(mca_btl_base_module_t *btl, mca_btl_uct_module_t *uct_btl = (mca_btl_uct_module_t *) btl; mca_btl_uct_base_frag_t *frag = NULL; - if (size <= (size_t) MCA_BTL_UCT_TL_ATTR(uct_btl->am_tl, 0).cap.am.max_short) { + if (size <= (size_t) uct_btl->am_tl->uct_iface_attr.cap.am.max_short) { frag = mca_btl_uct_frag_alloc_short(uct_btl, endpoint); } else if (size <= uct_btl->super.btl_eager_limit) { frag = mca_btl_uct_frag_alloc_eager(uct_btl, endpoint); @@ -40,7 +41,9 @@ mca_btl_base_descriptor_t *mca_btl_uct_alloc(mca_btl_base_module_t *btl, frag->base.des_segment_count = 1; frag->base.des_flags = flags; frag->base.order = order; - frag->uct_iov.length = size; + frag->uct_iov[1].length = size; + frag->uct_iov[2].length = 0; + frag->uct_iov_count = 2; if (NULL != frag->base.super.registration) { /* zero-copy fragments will need callbacks */ frag->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK; @@ -55,7 +58,6 @@ static inline void _mca_btl_uct_send_pack(void *data, void *header, size_t heade { uint32_t iov_count = 1; struct iovec iov; - size_t length; if (header_size > 0) { assert(NULL != header); @@ -89,11 +91,11 @@ struct mca_btl_base_descriptor_t *mca_btl_uct_prepare_src(mca_btl_base_module_t return NULL; } - _mca_btl_uct_send_pack((void *) ((intptr_t) frag->uct_iov.buffer + reserve), NULL, 0, + _mca_btl_uct_send_pack((void *) ((intptr_t) frag->uct_iov[1].buffer + reserve), NULL, 0, convertor, size); /* update the length of the fragment according to the convertor packed data */ frag->segments[0].seg_len = reserve + *size; - frag->uct_iov.length = frag->segments[0].seg_len; + frag->uct_iov[1].length = frag->segments[0].seg_len; } else { opal_convertor_get_current_pointer(convertor, &data_ptr); assert(NULL != data_ptr); @@ -103,20 +105,21 @@ struct mca_btl_base_descriptor_t *mca_btl_uct_prepare_src(mca_btl_base_module_t return NULL; } - frag->uct_iov.length = total_size; + frag->uct_iov[1].length = reserve; + + /* user data */ + frag->uct_iov[2].length = *size; + frag->uct_iov[2].buffer = data_ptr; + + frag->uct_iov_count = 3; + frag->base.order = order; frag->base.des_flags = flags; - if (total_size > (size_t) MCA_BTL_UCT_TL_ATTR(uct_btl->am_tl, 0).cap.am.max_short) { - frag->segments[0].seg_len = reserve; - frag->segments[1].seg_len = *size; - frag->segments[1].seg_addr.pval = data_ptr; - frag->base.des_segment_count = 2; - } else { - frag->segments[0].seg_len = total_size; - memcpy((void *) ((intptr_t) frag->segments[0].seg_addr.pval + reserve), data_ptr, - *size); - frag->base.des_segment_count = 1; - } + + frag->segments[0].seg_len = reserve; + frag->segments[1].seg_len = *size; + frag->segments[1].seg_addr.pval = data_ptr; + frag->base.des_segment_count = 2; } return &frag->base; @@ -167,7 +170,7 @@ static void mca_btl_uct_append_pending_frag(mca_btl_uct_module_t *uct_btl, int mca_btl_uct_send_frag(mca_btl_uct_module_t *uct_btl, mca_btl_uct_base_frag_t *frag, bool append) { mca_btl_uct_device_context_t *context = frag->context; - const ssize_t msg_size = frag->uct_iov.length + 8; + const ssize_t msg_size = frag->uct_iov[0].length + frag->uct_iov[1].length + frag->uct_iov[2].length; ssize_t size; ucs_status_t ucs_status; uct_ep_h ep_handle = NULL; @@ -182,10 +185,10 @@ int mca_btl_uct_send_frag(mca_btl_uct_module_t *uct_btl, mca_btl_uct_base_frag_t mca_btl_uct_context_lock(context); /* attempt to post the fragment */ if (NULL != frag->base.super.registration - && 
(context->uct_iface_attr.cap.flags & UCT_IFACE_FLAG_AM_ZCOPY)) { + && (uct_btl->am_tl->uct_iface_attr.cap.flags & UCT_IFACE_FLAG_AM_ZCOPY)) { frag->comp.dev_context = context; - ucs_status = uct_ep_am_zcopy(ep_handle, MCA_BTL_UCT_FRAG, &frag->header, - sizeof(frag->header), &frag->uct_iov, 1, 0, + ucs_status = uct_ep_am_zcopy(ep_handle, MCA_BTL_UCT_FRAG, frag->uct_iov[0].buffer, + frag->uct_iov[0].length, &frag->uct_iov[1], 1, 0, &frag->comp.uct_comp); if (OPAL_LIKELY(UCS_INPROGRESS == ucs_status)) { @@ -195,12 +198,8 @@ int mca_btl_uct_send_frag(mca_btl_uct_module_t *uct_btl, mca_btl_uct_base_frag_t } } else { /* short message */ - if (1 == frag->base.des_segment_count - && (frag->uct_iov.length + 8) - < MCA_BTL_UCT_TL_ATTR(uct_btl->am_tl, 0).cap.am.max_short) { - ucs_status = uct_ep_am_short(ep_handle, MCA_BTL_UCT_FRAG, frag->header.value, - frag->uct_iov.buffer, frag->uct_iov.length); - + if (msg_size < uct_btl->am_tl->uct_iface_attr.cap.am.max_short) { + ucs_status = uct_ep_am_short_iov(ep_handle, MCA_BTL_UCT_FRAG, frag->uct_iov, frag->uct_iov_count); if (OPAL_LIKELY(UCS_OK == ucs_status)) { uct_worker_progress(context->uct_worker); mca_btl_uct_context_unlock(context); @@ -233,7 +232,7 @@ int mca_btl_uct_send_frag(mca_btl_uct_module_t *uct_btl, mca_btl_uct_base_frag_t } OPAL_THREAD_LOCK(&uct_btl->lock); - mca_btl_uct_append_pending_frag(uct_btl, frag, context, true); + mca_btl_uct_append_pending_frag(uct_btl, frag, context, /*ready=*/true); OPAL_THREAD_UNLOCK(&uct_btl->lock); return OPAL_SUCCESS; @@ -250,7 +249,7 @@ int mca_btl_uct_send(mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoi BTL_VERBOSE(("btl/uct sending descriptor %p from %d -> %d. length = %" PRIu64, (void *) descriptor, OPAL_PROC_MY_NAME.vpid, endpoint->ep_proc->proc_name.vpid, - frag->uct_iov.length)); + frag->uct_iov[0].length + frag->uct_iov[1].length + frag->uct_iov[2].length)); frag->header.data.tag = tag; frag->context = context; @@ -260,14 +259,14 @@ int mca_btl_uct_send(mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoi OPAL_THREAD_LOCK(&uct_btl->lock); /* check one more time in case another thread is completing the connection now */ if (OPAL_SUCCESS != mca_btl_uct_endpoint_test_am(uct_btl, endpoint, context, &ep_handle)) { - mca_btl_uct_append_pending_frag(uct_btl, frag, context, false); + mca_btl_uct_append_pending_frag(uct_btl, frag, context, /*ready=*/false); OPAL_THREAD_UNLOCK(&uct_btl->lock); return OPAL_SUCCESS; } OPAL_THREAD_UNLOCK(&uct_btl->lock); } - return mca_btl_uct_send_frag(uct_btl, frag, true); + return mca_btl_uct_send_frag(uct_btl, frag, /*append=*/true); } struct mca_btl_uct_sendi_pack_args_t { @@ -291,9 +290,9 @@ static size_t mca_btl_uct_sendi_pack(void *data, void *arg) return args->header_size + args->payload_size + 8; } -static inline size_t mca_btl_uct_max_sendi(mca_btl_uct_module_t *uct_btl, int context_id) +static inline size_t mca_btl_uct_max_sendi(mca_btl_uct_module_t *uct_btl) { - return MCA_BTL_UCT_TL_ATTR(uct_btl->am_tl, context_id).cap.am.max_bcopy; + return uct_btl->am_tl->uct_iface_attr.cap.am.max_bcopy; } int mca_btl_uct_sendi(mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, @@ -313,7 +312,7 @@ int mca_btl_uct_sendi(mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpo rc = mca_btl_uct_endpoint_check_am(uct_btl, endpoint, context, &ep_handle); if (OPAL_UNLIKELY(OPAL_SUCCESS != rc - || msg_size > mca_btl_uct_max_sendi(uct_btl, context->context_id))) { + || msg_size > mca_btl_uct_max_sendi(uct_btl))) { if (descriptor) { *descriptor = 
mca_btl_uct_alloc(btl, endpoint, order, total_size, flags); } @@ -327,7 +326,7 @@ int mca_btl_uct_sendi(mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpo if (0 == payload_size) { ucs_status = uct_ep_am_short(ep_handle, MCA_BTL_UCT_FRAG, am_header.value, header, header_size); - } else if (msg_size < (size_t) MCA_BTL_UCT_TL_ATTR(uct_btl->am_tl, context->context_id) + } else if (msg_size < (size_t) uct_btl->am_tl->uct_iface_attr .cap.am.max_short) { int8_t *data = alloca(total_size); size_t packed_payload_size = payload_size; diff --git a/opal/mca/btl/uct/btl_uct_component.c b/opal/mca/btl/uct/btl_uct_component.c index 3d7f2fb65f6..7c5999facad 100644 --- a/opal/mca/btl/uct/btl_uct_component.c +++ b/opal/mca/btl/uct/btl_uct_component.c @@ -29,6 +29,8 @@ #include "opal_config.h" +#include "btl_uct_discover.h" +#include "btl_uct_modex.h" #include "opal/mca/btl/base/base.h" #include "opal/mca/btl/btl.h" #include "opal/mca/hwloc/base/base.h" @@ -44,21 +46,43 @@ #include "btl_uct_am.h" #include "btl_uct_device_context.h" -static int mca_btl_uct_component_register(void) +static void mca_btl_uct_cleanup(void) { - mca_btl_uct_module_t *module = &mca_btl_uct_module_template; + if (!mca_btl_uct_component.initialized) { + return; + } + + BTL_VERBOSE(("in UCT btl cleanup")); + + OBJ_DESTRUCT(&mca_btl_uct_component.memory_domain_list); + OBJ_DESTRUCT(&mca_btl_uct_component.connection_domain_list); + + OPAL_LIST_DESTRUCT(&mca_btl_uct_component.md_list); + +#if UCT_API >= UCT_VERSION(1, 7) + if (NULL != mca_btl_uct_component.uct_components) { + uct_release_component_list(mca_btl_uct_component.uct_components); + mca_btl_uct_component.uct_components = NULL; + mca_btl_uct_component.num_uct_components = 0; + } +#endif - mca_btl_uct_component.memory_domains = "mlx5_0,mlx4_0,rocep0s4"; + mca_btl_uct_component.initialized = false; +} + +static int mca_btl_uct_component_register(void) +{ + mca_btl_uct_component.memory_domains = "mlx5_0,mlx4_0,rocep0s4,irdma0"; (void) mca_base_component_var_register( &mca_btl_uct_component.super.btl_version, "memory_domains", "Comma-delimited list of memory domains of the form " "to use for communication. Memory domains MUST provide transports that " "support put, get, and amos. Special values: all (all available), none." - " (default: mlx5_0,mlx4_0,rocep0s4)", + " (default: mlx5_0,mlx4_0,rocep0s4,irdma0)", MCA_BASE_VAR_TYPE_STRING, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_uct_component.memory_domains); - mca_btl_uct_component.allowed_transports = "dc_mlx5,rc_mlx5,ud,ugni_rdma,ugni_smsg,any"; + mca_btl_uct_component.allowed_transports = "dc_mlx5,rc_mlx5,rc_verbs,ud,ud_verbs,ugni_rdma,ugni_smsg,any"; (void) mca_base_component_var_register( &mca_btl_uct_component.super.btl_version, "transports", "Comma-delimited list of transports to use sorted by increasing " @@ -67,6 +91,15 @@ static int mca_btl_uct_component_register(void) MCA_BASE_VAR_TYPE_STRING, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_uct_component.allowed_transports); + mca_btl_uct_component.connection_domains = "tcp"; + (void) mca_base_component_var_register( + &mca_btl_uct_component.super.btl_version, "connection_domains", + "Comma-delimited list of connection-only domains to use sorted by increasing " + "priority. The list of transports available can be queried using ucx_info. 
Special " + "values: any (any available) (default: tcp)", + MCA_BASE_VAR_TYPE_STRING, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, + MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_uct_component.connection_domains); + mca_btl_uct_component.num_contexts_per_module = 0; (void) mca_base_component_var_register( &mca_btl_uct_component.super.btl_version, "num_contexts_per_module", @@ -113,10 +146,24 @@ static int mca_btl_uct_component_register(void) MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_4, MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_uct_component.connection_retry_timeout); - /* for now we want this component to lose to btl/ugni and btl/vader */ - module->super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_HIGH - 1; + OBJ_CONSTRUCT(&mca_btl_uct_component.md_list, opal_list_t); + OBJ_CONSTRUCT(&mca_btl_uct_component.memory_domain_list, mca_btl_uct_include_list_t); + OBJ_CONSTRUCT(&mca_btl_uct_component.connection_domain_list, mca_btl_uct_include_list_t); - return mca_btl_base_param_register(&mca_btl_uct_component.super.btl_version, &module->super); + int rc = mca_btl_uct_component_discover_mds(); + if (OPAL_SUCCESS != rc) { + return rc; + } + + rc = mca_btl_uct_component_generate_modules(&mca_btl_uct_component.md_list); + if (OPAL_SUCCESS != rc) { + return rc; + } + + mca_btl_uct_component.initialized = true; + opal_finalize_register_cleanup(mca_btl_uct_cleanup); + + return OPAL_SUCCESS; } static void mca_btl_uct_mem_release_cb(void *buf, size_t length, void *cbdata, bool from_alloc) @@ -167,155 +214,16 @@ static int mca_btl_uct_component_open(void) */ static int mca_btl_uct_component_close(void) { + mca_btl_uct_component.conn_tl = NULL; + if (mca_btl_uct_component.disable_ucx_memory_hooks) { opal_mem_hooks_unregister_release(mca_btl_uct_mem_release_cb); } - return OPAL_SUCCESS; -} - -static size_t mca_btl_uct_tl_modex_size(mca_btl_uct_tl_t *tl) -{ - const size_t size = strlen(tl->uct_tl_name) + 1; - - if (MCA_BTL_UCT_TL_ATTR(tl, 0).cap.flags & UCT_IFACE_FLAG_CONNECT_TO_IFACE) { - /* pad out to a multiple of 4 bytes */ - return (4 + 3 + size + MCA_BTL_UCT_TL_ATTR(tl, 0).device_addr_len - + MCA_BTL_UCT_TL_ATTR(tl, 0).iface_addr_len) - & ~3; - } - - return (4 + 3 + size + MCA_BTL_UCT_TL_ATTR(tl, 0).device_addr_len) & ~3; -} - -static size_t mca_btl_uct_module_modex_size(mca_btl_uct_module_t *module) -{ - size_t modex_size = 4 + strlen(module->md_name) + 1; - - if (module->rdma_tl) { - modex_size += mca_btl_uct_tl_modex_size(module->rdma_tl); - } - - if (module->am_tl && module->am_tl != module->rdma_tl) { - modex_size += mca_btl_uct_tl_modex_size(module->am_tl); - } - - if (module->conn_tl && module->conn_tl != module->rdma_tl && module->conn_tl != module->am_tl) { - modex_size += mca_btl_uct_tl_modex_size(module->conn_tl); - } - - return modex_size; -} - -static size_t mca_btl_uct_tl_modex_pack(mca_btl_uct_tl_t *tl, uint8_t *modex_data) -{ - mca_btl_uct_device_context_t *dev_context = tl->uct_dev_contexts[0]; - size_t modex_size = mca_btl_uct_tl_modex_size(tl); - - *((uint32_t *) modex_data) = (uint32_t) modex_size; - modex_data += 4; - - strcpy((char *) modex_data, tl->uct_tl_name); - modex_data += strlen(tl->uct_tl_name) + 1; - - /* NTH: only the first context is available. i assume the device addresses of the - * contexts will be the same but they will have different iface addresses. i also - * am assuming that it doesn't really matter if all remote contexts connect to - * the same endpoint since we are only doing RDMA.
if any of these assumptions are - * wrong then we can't delay creating the other contexts and must include their - * information in the modex. */ - if (MCA_BTL_UCT_TL_ATTR(tl, 0).cap.flags & UCT_IFACE_FLAG_CONNECT_TO_IFACE) { - uct_iface_get_address(dev_context->uct_iface, (uct_iface_addr_t *) modex_data); - modex_data += MCA_BTL_UCT_TL_ATTR(tl, 0).iface_addr_len; - } + /* complete delayed cleanup */ + mca_btl_uct_cleanup(); - uct_iface_get_device_address(dev_context->uct_iface, (uct_device_addr_t *) modex_data); - modex_data += MCA_BTL_UCT_TL_ATTR(tl, 0).device_addr_len; - - return modex_size; -} - -static int mca_btl_uct_modex_send(void) -{ - size_t modex_size = sizeof(mca_btl_uct_modex_t); - mca_btl_uct_modex_t *modex; - uint8_t *modex_data; - int rc; - - for (int i = 0; i < mca_btl_uct_component.module_count; ++i) { - modex_size += mca_btl_uct_module_modex_size(mca_btl_uct_component.modules[i]); - } - - modex = alloca(modex_size); - modex_data = modex->data; - - modex->module_count = mca_btl_uct_component.module_count; - - for (int i = 0; i < mca_btl_uct_component.module_count; ++i) { - mca_btl_uct_module_t *module = mca_btl_uct_component.modules[i]; - size_t name_len = strlen(module->md_name); - - /* pack the size */ - *((uint32_t *) modex_data) = (uint32_t) mca_btl_uct_module_modex_size(module); - - modex_data += 4; - - strcpy((char *) modex_data, module->md_name); - modex_data += name_len + 1; - - if (module->rdma_tl) { - modex_data += mca_btl_uct_tl_modex_pack(module->rdma_tl, modex_data); - } - - if (module->am_tl && module->am_tl != module->rdma_tl) { - modex_data += mca_btl_uct_tl_modex_pack(module->am_tl, modex_data); - } - - if (module->conn_tl && module->conn_tl != module->rdma_tl - && module->conn_tl != module->am_tl) { - modex_data += mca_btl_uct_tl_modex_pack(module->conn_tl, modex_data); - } - } - - OPAL_MODEX_SEND(rc, PMIX_GLOBAL, &mca_btl_uct_component.super.btl_version, modex, modex_size); - return rc; -} - -static mca_btl_uct_module_t *mca_btl_uct_alloc_module(const char *md_name, mca_btl_uct_md_t *md, - size_t registration_size) -{ - mca_btl_uct_module_t *module; - ucs_status_t ucs_status; - - module = malloc(sizeof(*module)); - if (NULL == module) { - return NULL; - } - - /* copy the module template */ - *module = mca_btl_uct_module_template; - - OBJ_CONSTRUCT(&module->id_to_endpoint, opal_hash_table_t); - OBJ_CONSTRUCT(&module->endpoint_lock, opal_mutex_t); - OBJ_CONSTRUCT(&module->short_frags, opal_free_list_t); - OBJ_CONSTRUCT(&module->eager_frags, opal_free_list_t); - OBJ_CONSTRUCT(&module->max_frags, opal_free_list_t); - OBJ_CONSTRUCT(&module->pending_frags, opal_list_t); - OBJ_CONSTRUCT(&module->lock, opal_recursive_mutex_t); - OBJ_CONSTRUCT(&module->pending_connection_reqs, opal_fifo_t); - - module->md = md; - module->md_name = strdup(md_name); - module->super.btl_registration_handle_size = registration_size; - - ucs_status = ucs_async_context_create(UCS_ASYNC_MODE_THREAD, &module->ucs_async); - if (UCS_OK != ucs_status) { - BTL_VERBOSE(("Could not create a UCT async context")); - mca_btl_uct_finalize(&module->super); - return NULL; - } - - return module; + return OPAL_SUCCESS; } ucs_status_t mca_btl_uct_am_handler(void *arg, void *data, size_t length, unsigned flags) @@ -341,170 +249,6 @@ ucs_status_t mca_btl_uct_am_handler(void *arg, void *data, size_t length, unsign return UCS_OK; } -#if UCT_API >= UCT_VERSION(1, 7) -static int mca_btl_uct_component_process_uct_md(uct_component_h component, - uct_md_resource_desc_t *md_desc, - char **allowed_ifaces) -#else 
-static int mca_btl_uct_component_process_uct_md(uct_md_resource_desc_t *md_desc, - char **allowed_ifaces) -#endif -{ - mca_rcache_base_resources_t rcache_resources; - uct_tl_resource_desc_t *tl_desc; - mca_btl_uct_module_t *module; - uct_md_config_t *uct_config; - uct_md_attr_t md_attr; - mca_btl_uct_md_t *md; - bool found = false; - unsigned num_tls; - char *tmp; - ucs_status_t ucs_status; - - if (MCA_BTL_UCT_MAX_MODULES == mca_btl_uct_component.module_count) { - BTL_VERBOSE(("created the maximum number of allowable modules")); - return OPAL_ERR_NOT_AVAILABLE; - } - - BTL_VERBOSE(("processing memory domain %s", md_desc->md_name)); - - for (int j = 0; allowed_ifaces[j]; ++j) { - if (0 == strncmp(allowed_ifaces[j], md_desc->md_name, strlen(md_desc->md_name)) - || 0 == strcmp(allowed_ifaces[j], "all")) { - found = true; - break; - } - } - - if (!found) { - /* nothing to do */ - return OPAL_SUCCESS; - } - - md = OBJ_NEW(mca_btl_uct_md_t); - -#if UCT_API >= UCT_VERSION(1, 7) - ucs_status = uct_md_config_read(component, NULL, NULL, &uct_config); - if (UCS_OK != ucs_status) { - BTL_VERBOSE(("uct_md_config_read failed %d (%s)", ucs_status, ucs_status_string(ucs_status))); - return OPAL_ERR_NOT_AVAILABLE; - } - ucs_status = uct_md_open(component, md_desc->md_name, uct_config, &md->uct_md); - if (UCS_OK != ucs_status) { - BTL_VERBOSE(("uct_md_open failed %d (%s)", ucs_status, ucs_status_string(ucs_status))); - return OPAL_ERR_NOT_AVAILABLE; - } -#else - ucs_status = uct_md_config_read(md_desc->md_name, NULL, NULL, &uct_config); - if (UCS_OK != ucs_status) { - BTL_VERBOSE(("uct_md_config_read failed %d (%s)", ucs_status, ucs_status_string(ucs_status))); - return OPAL_ERR_NOT_AVAILABLE; - } - ucs_status = uct_md_open(md_desc->md_name, uct_config, &md->uct_md); - if (UCS_OK != ucs_status) { - BTL_VERBOSE(("uct_md_open failed %d (%s)", ucs_status, ucs_status_string(ucs_status))); - return OPAL_ERR_NOT_AVAILABLE; - } -#endif - uct_config_release(uct_config); - - ucs_status = uct_md_query(md->uct_md, &md_attr); - if (UCS_OK != ucs_status) { - BTL_VERBOSE(("uct_config_release failed %d (%s)", ucs_status, ucs_status_string(ucs_status))); - return OPAL_ERR_NOT_AVAILABLE; - } - ucs_status = uct_md_query_tl_resources(md->uct_md, &tl_desc, &num_tls); - if (UCS_OK != ucs_status) { - BTL_VERBOSE(("uct_config_release failed %d (%s)", ucs_status, ucs_status_string(ucs_status))); - return OPAL_ERR_NOT_AVAILABLE; - } - - module = mca_btl_uct_alloc_module(md_desc->md_name, md, md_attr.rkey_packed_size); - if (NULL == module) { - uct_release_tl_resource_list(tl_desc); - return OPAL_ERR_OUT_OF_RESOURCE; - } - - (void) mca_btl_uct_query_tls(module, md, tl_desc, num_tls); - - uct_release_tl_resource_list(tl_desc); - - /* release the initial reference to the md object. if any modules were created the UCT md will - * remain open until those modules are finalized. */ - OBJ_RELEASE(md); - - if (NULL == module->am_tl && NULL == module->rdma_tl) { - BTL_VERBOSE(("uct memory domain %s does not have any appropriate tls", md_desc->md_name)); - mca_btl_uct_finalize(&module->super); - return OPAL_ERR_NOT_AVAILABLE; - } - -#if UCT_API >= UCT_VERSION(1, 7) - module->uct_component = component; -#endif - - mca_btl_uct_component.modules[mca_btl_uct_component.module_count++] = module; - - /* NTH: a registration cache shouldn't be necessary when using UCT but there are measurable - * performance benefits to using rcache/grdma instead of assuming UCT will do the right - * thing. 
*/ - (void) opal_asprintf(&tmp, "uct.%s", module->md_name); - - rcache_resources.cache_name = tmp; - rcache_resources.reg_data = (void *) module; - rcache_resources.sizeof_reg = sizeof(mca_btl_uct_reg_t) - + module->super.btl_registration_handle_size; - rcache_resources.register_mem = mca_btl_uct_reg_mem; - rcache_resources.deregister_mem = mca_btl_uct_dereg_mem; - - module->rcache = mca_rcache_base_module_create("grdma", module, &rcache_resources); - free(tmp); - if (NULL == module->rcache) { - /* something when horribly wrong */ - BTL_VERBOSE(("could not allocate a registration cache for this btl module")); - mca_btl_uct_finalize(&module->super); - return OPAL_ERROR; - } - - return OPAL_SUCCESS; -} - -#if UCT_API >= UCT_VERSION(1, 7) -static int mca_btl_uct_component_process_uct_component(uct_component_h component, - char **allowed_ifaces) -{ - uct_component_attr_t attr = {.field_mask = UCT_COMPONENT_ATTR_FIELD_NAME - | UCT_COMPONENT_ATTR_FIELD_MD_RESOURCE_COUNT}; - ucs_status_t ucs_status; - int rc; - - ucs_status = uct_component_query(component, &attr); - if (UCS_OK != ucs_status) { - return OPAL_ERROR; - } - - BTL_VERBOSE(("processing uct component %s", attr.name)); - - attr.md_resources = calloc(attr.md_resource_count, sizeof(*attr.md_resources)); - attr.field_mask |= UCT_COMPONENT_ATTR_FIELD_MD_RESOURCES; - ucs_status = uct_component_query(component, &attr); - if (UCS_OK != ucs_status) { - return OPAL_ERROR; - } - - for (unsigned i = 0; i < attr.md_resource_count; ++i) { - rc = mca_btl_uct_component_process_uct_md(component, attr.md_resources + i, allowed_ifaces); - if (OPAL_SUCCESS != rc) { - break; - } - } - - free(attr.md_resources); - - return OPAL_SUCCESS; -} -#endif /* UCT_API >= UCT_VERSION(1, 7) */ - /* * UCT component initialization: * (1) read interface list from kernel and compare against component parameters @@ -520,8 +264,6 @@ static mca_btl_base_module_t **mca_btl_uct_component_init(int *num_btl_modules, /* for this BTL to be useful the interface needs to support RDMA and certain atomic operations */ struct mca_btl_base_module_t **base_modules; - ucs_status_t ucs_status; - char **allowed_ifaces; int rc; BTL_VERBOSE(("initializing uct btl")); @@ -533,54 +275,26 @@ static mca_btl_base_module_t **mca_btl_uct_component_init(int *num_btl_modules, return NULL; } - allowed_ifaces = opal_argv_split(mca_btl_uct_component.memory_domains, ','); - if (NULL == allowed_ifaces) { + rc = mca_btl_uct_enable_modules(mca_btl_uct_component.modules, mca_btl_uct_component.module_count); + if (OPAL_SUCCESS != rc) { return NULL; } - mca_btl_uct_component.module_count = 0; - -#if UCT_API >= UCT_VERSION(1, 7) - uct_component_h *components; - unsigned num_components; - - ucs_status = uct_query_components(&components, &num_components); - if (UCS_OK != ucs_status) { - BTL_ERROR(("could not query UCT components")); + rc = mca_btl_uct_component_maybe_setup_conn_tl(); + if (OPAL_SUCCESS != rc && OPAL_ERR_NOT_FOUND != rc) { return NULL; } - /* generate all suitable btl modules */ - for (unsigned i = 0; i < num_components; ++i) { - rc = mca_btl_uct_component_process_uct_component(components[i], allowed_ifaces); - if (OPAL_SUCCESS != rc) { - break; - } + rc = mca_btl_uct_component_filter_mds(); + if (OPAL_SUCCESS != rc) { + return NULL; } - uct_release_component_list(components); - -#else /* UCT 1.6 and older */ - uct_md_resource_desc_t *resources; - unsigned resource_count; - - uct_query_md_resources(&resources, &resource_count); - - /* generate all suitable btl modules */ - for (unsigned i = 0; i < 
resource_count; ++i) { - rc = mca_btl_uct_component_process_uct_md(resources + i, allowed_ifaces); - if (OPAL_SUCCESS != rc) { - break; - } + rc = mca_btl_uct_component_modex_send(); + if (OPAL_SUCCESS != rc) { + return NULL; } - uct_release_md_resource_list(resources); - -#endif /* UCT_API >= UCT_VERSION(1, 7) */ - - opal_argv_free(allowed_ifaces); - mca_btl_uct_modex_send(); - /* pass module array back to caller */ base_modules = calloc(mca_btl_uct_component.module_count, sizeof(*base_modules)); if (NULL == base_modules) { @@ -633,7 +347,7 @@ static int mca_btl_uct_component_progress_pending(mca_btl_uct_module_t *uct_btl) opal_list_remove_item(&uct_btl->pending_frags, (opal_list_item_t *) frag); - if (OPAL_SUCCESS > mca_btl_uct_send_frag(uct_btl, frag, false)) { + if (OPAL_SUCCESS > mca_btl_uct_send_frag(uct_btl, frag, /*append=*/false)) { opal_list_prepend(&uct_btl->pending_frags, (opal_list_item_t *) frag); } else { completed++; @@ -644,6 +358,36 @@ static int mca_btl_uct_component_progress_pending(mca_btl_uct_module_t *uct_btl) return completed; } +static int mca_btl_uct_component_progress_connections (mca_btl_uct_tl_t *conn_tl) { + mca_btl_uct_pending_connection_request_t *request; + int ret; + + if (conn_tl == NULL) { + return 0; + } + + ret = mca_btl_uct_tl_progress(conn_tl, 0); + + while (NULL + != (request = (mca_btl_uct_pending_connection_request_t *) opal_fifo_pop_atomic( + &conn_tl->pending_connection_reqs))) { + mca_btl_uct_conn_req_t *conn_req = (mca_btl_uct_conn_req_t *) request->request_data; + BTL_VERBOSE(("processing connection request....")); + if (conn_req->module_index >= mca_btl_uct_component.module_count) { + BTL_ERROR(("invalid connection request received")); + abort(); + } + int rc = mca_btl_uct_process_connection_request(mca_btl_uct_component.modules[conn_req->module_index], conn_req); + if (rc != OPAL_SUCCESS) { + opal_fifo_push_atomic(&conn_tl->pending_connection_reqs, &request->super); + break; + } + OBJ_RELEASE(request); + } + + return ret; +} + /** * @brief UCT BTL progress function * @@ -654,38 +398,28 @@ static int mca_btl_uct_component_progress(void) int starting_index = mca_btl_uct_get_context_index(); unsigned ret = 0; - for (int i = 0; i < mca_btl_uct_component.module_count; ++i) { - mca_btl_uct_module_t *module = mca_btl_uct_component.modules[i]; - + mca_btl_uct_md_t *md; + OPAL_LIST_FOREACH(md, &mca_btl_uct_component.md_list, mca_btl_uct_md_t) { /* unlike ucp, uct actually tells us something useful! its almost like it was "inspired" * by the btl progress functions.... 
*/ - ret += mca_btl_uct_tl_progress(module->rdma_tl, starting_index); - - if (module->am_tl != module->rdma_tl) { - ret += mca_btl_uct_tl_progress(module->am_tl, starting_index); - } - - if (module->conn_tl) { - mca_btl_uct_pending_connection_request_t *request; - - if (module->conn_tl != module->am_tl && module->conn_tl != module->rdma_tl) { - ret += mca_btl_uct_tl_progress(module->conn_tl, 0); - } - - while (NULL - != (request = (mca_btl_uct_pending_connection_request_t *) opal_fifo_pop_atomic( - &module->pending_connection_reqs))) { - mca_btl_uct_process_connection_request(module, (mca_btl_uct_conn_req_t *) - request->request_data); - OBJ_RELEASE(request); - } + mca_btl_uct_tl_t *tl; + OPAL_LIST_FOREACH(tl, &md->tls, mca_btl_uct_tl_t) { + ret += mca_btl_uct_tl_progress(tl, starting_index); } + } + + for (int i = 0; i < mca_btl_uct_component.module_count; ++i) { + mca_btl_uct_module_t *module = mca_btl_uct_component.modules[i]; if (0 != opal_list_get_size(&module->pending_frags)) { mca_btl_uct_component_progress_pending(module); } } + if (NULL != mca_btl_uct_component.conn_tl) { + ret += mca_btl_uct_component_progress_connections (mca_btl_uct_component.conn_tl); + } + return (int) ret; } diff --git a/opal/mca/btl/uct/btl_uct_device_context.c b/opal/mca/btl/uct/btl_uct_device_context.c new file mode 100644 index 00000000000..60cd8ad4385 --- /dev/null +++ b/opal/mca/btl/uct/btl_uct_device_context.c @@ -0,0 +1,154 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2018 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2018 Research Organization for Information Science + * and Technology (RIST). All rights reserved. + * Copyright (c) 2018 Triad National Security, LLC. All rights + * reserved. + * Copyright (c) 2019-2025 Google, LLC. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "opal_config.h" + +#include +#include +#include +#include + +#include "btl_uct.h" +#include "btl_uct_device_context.h" +#include "btl_uct_types.h" + +#include "opal/class/opal_free_list.h" +#include "opal/class/opal_object.h" + +#if HAVE_DECL_UCT_CB_FLAG_SYNC +# define MCA_BTL_UCT_CB_FLAG_SYNC UCT_CB_FLAG_SYNC +#else +# define MCA_BTL_UCT_CB_FLAG_SYNC 0 +#endif + +static void mca_btl_uct_context_enable_progress(mca_btl_uct_device_context_t *context) +{ + if (!context->progress_enabled) { +#if HAVE_DECL_UCT_PROGRESS_THREAD_SAFE + uct_iface_progress_enable(context->uct_iface, + UCT_PROGRESS_THREAD_SAFE | UCT_PROGRESS_SEND | UCT_PROGRESS_RECV); +#else + uct_iface_progress_enable(context->uct_iface, UCT_PROGRESS_SEND | UCT_PROGRESS_RECV); +#endif + context->progress_enabled = true; + } +} + +void mca_btl_uct_context_enable_am_handler(mca_btl_uct_tl_t *tl, + mca_btl_uct_device_context_t *context) +{ + if (context->am_handler_installed) { + return; + } + + BTL_VERBOSE(("installing AM handler for tl %s::%s context id %d", + tl->uct_md->md_name, tl->uct_tl_name, context->context_id)); + uct_iface_set_am_handler(context->uct_iface, MCA_BTL_UCT_FRAG, mca_btl_uct_am_handler, + context, MCA_BTL_UCT_CB_FLAG_SYNC); + context->am_handler_installed = true; +} + +mca_btl_uct_device_context_t *mca_btl_uct_context_create(mca_btl_uct_module_t *module, + mca_btl_uct_tl_t *tl, int context_id, + bool enable_progress) +{ +#if UCT_API >= UCT_VERSION(1, 6) + uct_iface_params_t iface_params = {.field_mask = UCT_IFACE_PARAM_FIELD_OPEN_MODE + | UCT_IFACE_PARAM_FIELD_DEVICE, + .open_mode = UCT_IFACE_OPEN_MODE_DEVICE, + .mode = {.device = {.tl_name = tl->uct_tl_name, + .dev_name = tl->uct_dev_name}}}; +#else + uct_iface_params_t iface_params = {.rndv_cb = NULL, + .eager_cb = NULL, + .stats_root = NULL, + .rx_headroom = 0, + .open_mode = UCT_IFACE_OPEN_MODE_DEVICE, + .mode = {.device = {.tl_name = tl->uct_tl_name, + .dev_name = tl->uct_dev_name}}}; +#endif + mca_btl_uct_device_context_t *context; + ucs_status_t ucs_status; + int rc; + + context = calloc(1, sizeof(*context)); + if (OPAL_UNLIKELY(NULL == context)) { + return NULL; + } + + context->context_id = context_id; + context->uct_btl = module; + OBJ_CONSTRUCT(&context->completion_fifo, opal_fifo_t); + OBJ_CONSTRUCT(&context->mutex, opal_recursive_mutex_t); + OBJ_CONSTRUCT(&context->rdma_completions, opal_free_list_t); + + rc = opal_free_list_init(&context->rdma_completions, sizeof(mca_btl_uct_uct_completion_t), + opal_cache_line_size, OBJ_CLASS(mca_btl_uct_uct_completion_t), 0, + opal_cache_line_size, 0, 4096, 128, NULL, 0, NULL, NULL, NULL); + if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { + mca_btl_uct_context_destroy(context); + return NULL; + } + + /* apparently (in contradiction to the spec) UCT is *not* thread safe. because we have to + * use our own locks just go ahead and use UCS_THREAD_MODE_SINGLE. if they ever fix their + * api then change this back to UCS_THREAD_MODE_MULTI and remove the locks around the + * various UCT calls. 
*/ + ucs_status = uct_worker_create(tl->ucs_async, UCS_THREAD_MODE_SINGLE, &context->uct_worker); + if (OPAL_UNLIKELY(UCS_OK != ucs_status)) { + BTL_VERBOSE(("could not create a UCT worker")); + mca_btl_uct_context_destroy(context); + return NULL; + } + + ucs_status = uct_iface_open(tl->uct_md->uct_md, context->uct_worker, &iface_params, + tl->uct_tl_config, &context->uct_iface); + if (OPAL_UNLIKELY(UCS_OK != ucs_status)) { + BTL_VERBOSE(("could not open UCT interface. error code: %d", ucs_status)); + mca_btl_uct_context_destroy(context); + return NULL; + } + + if (module != NULL && tl == module->am_tl) { + mca_btl_uct_context_enable_am_handler(tl, context); + } + + if (enable_progress) { + BTL_VERBOSE(("enabling progress for tl %s::%s context id %d", + tl->uct_md->md_name, tl->uct_tl_name, context_id)); + mca_btl_uct_context_enable_progress(context); + } + + return context; +} + +void mca_btl_uct_context_destroy(mca_btl_uct_device_context_t *context) +{ + if (context->uct_iface) { + uct_iface_close(context->uct_iface); + context->uct_iface = NULL; + } + + if (context->uct_worker) { + uct_worker_destroy(context->uct_worker); + context->uct_worker = NULL; + } + + OBJ_DESTRUCT(&context->completion_fifo); + OBJ_DESTRUCT(&context->rdma_completions); + free(context); +} + diff --git a/opal/mca/btl/uct/btl_uct_device_context.h b/opal/mca/btl/uct/btl_uct_device_context.h index 7e25e0bef19..915a9868d3a 100644 --- a/opal/mca/btl/uct/btl_uct_device_context.h +++ b/opal/mca/btl/uct/btl_uct_device_context.h @@ -37,6 +37,15 @@ mca_btl_uct_device_context_t *mca_btl_uct_context_create(mca_btl_uct_module_t *m */ void mca_btl_uct_context_destroy(mca_btl_uct_device_context_t *context); +/** + * @brief Enable active messages on context if not already enabled + * + * @param[in] tl TL this context belongs to + * @param[in] context Context to enable active messages on. + */ +void mca_btl_uct_context_enable_am_handler(mca_btl_uct_tl_t *tl, + mca_btl_uct_device_context_t *context); + static inline bool mca_btl_uct_context_trylock(mca_btl_uct_device_context_t *context) { return OPAL_THREAD_TRYLOCK(&context->mutex); @@ -94,14 +103,14 @@ mca_btl_uct_module_get_tl_context_specific(mca_btl_uct_module_t *module, mca_btl mca_btl_uct_device_context_t *context = tl->uct_dev_contexts[context_id]; if (OPAL_UNLIKELY(NULL == context)) { - OPAL_THREAD_LOCK(&module->lock); + OPAL_THREAD_LOCK(&tl->tl_lock); context = tl->uct_dev_contexts[context_id]; if (OPAL_UNLIKELY(NULL == context)) { context = tl->uct_dev_contexts[context_id] = mca_btl_uct_context_create(module, tl, context_id, true); } - OPAL_THREAD_UNLOCK(&module->lock); + OPAL_THREAD_UNLOCK(&tl->tl_lock); } return context; diff --git a/opal/mca/btl/uct/btl_uct_discover.c b/opal/mca/btl/uct/btl_uct_discover.c new file mode 100644 index 00000000000..1461f6b1678 --- /dev/null +++ b/opal/mca/btl/uct/btl_uct_discover.c @@ -0,0 +1,522 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2014-2018 Los Alamos National Security, LLC. 
All rights + * reserved. + * Copyright (c) 2018 Research Organization for Information Science + * and Technology (RIST). All rights reserved. + * Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved. + * Copyright (c) 2018-2024 Triad National Security, LLC. All rights + * reserved. + * Copyright (c) 2019-2025 Google, LLC. All rights reserved. + * Copyright (c) 2019 Intel, Inc. All rights reserved. + * Copyright (c) 2022 IBM Corporation. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "opal_config.h" + +#include "btl_uct_device_context.h" +#include "btl_uct_discover.h" +#include "btl_uct_include_list.h" + +#include "btl_uct.h" +#include "opal/class/opal_list.h" +#include "opal/util/printf.h" + +#if UCT_API >= UCT_VERSION(1, 7) +static int mca_btl_uct_component_process_uct_md(uct_component_h component, + uct_md_resource_desc_t *md_desc) +#else +static int mca_btl_uct_component_process_uct_md(uct_md_resource_desc_t *md_desc) +#endif +{ + uct_tl_resource_desc_t *tl_desc; + uct_md_config_t *uct_config; + mca_btl_uct_md_t *md; + int list_rank; + unsigned num_tls; + ucs_status_t ucs_status; + int connection_list_rank = -1; + bool consider_for_connection_module = false; + + BTL_VERBOSE(("processing memory domain %s", md_desc->md_name)); + + BTL_VERBOSE(("checking if %s should be used for communication", md_desc->md_name)); + list_rank = mca_btl_uct_include_list_rank (md_desc->md_name, &mca_btl_uct_component.memory_domain_list); + + if (list_rank < 0) { + BTL_VERBOSE(("checking if %s should be used for connections", md_desc->md_name)); + connection_list_rank = mca_btl_uct_include_list_rank (md_desc->md_name, &mca_btl_uct_component.connection_domain_list); + + if (connection_list_rank < 0) { + /* nothing to do */ + BTL_VERBOSE(("not continuing with memory domain %s", md_desc->md_name)); + return OPAL_SUCCESS; + } + + BTL_VERBOSE(("will be considering domain %s for connections only", md_desc->md_name)); + consider_for_connection_module = true; + } + + md = OBJ_NEW(mca_btl_uct_md_t); + md->md_name = strdup(md_desc->md_name); +#if UCT_API >= UCT_VERSION(1, 7) + md->uct_component = component; +#endif + md->connection_only_domain = consider_for_connection_module; + +#if UCT_API >= UCT_VERSION(1, 7) + ucs_status = uct_md_config_read(component, NULL, NULL, &uct_config); + if (UCS_OK != ucs_status) { + BTL_VERBOSE(("uct_md_config_read failed %d (%s)", ucs_status, ucs_status_string(ucs_status))); + return OPAL_ERR_NOT_AVAILABLE; + } + + ucs_status = uct_md_open(component, md->md_name, uct_config, &md->uct_md); + if (UCS_OK != ucs_status) { + BTL_VERBOSE(("uct_md_open failed %d (%s)", ucs_status, ucs_status_string(ucs_status))); + return OPAL_ERR_NOT_AVAILABLE; + } +#else + ucs_status = uct_md_config_read(md->md_name, NULL, NULL, &uct_config); + if (UCS_OK != ucs_status) { + BTL_VERBOSE(("uct_md_config_read failed %d (%s)", ucs_status, ucs_status_string(ucs_status))); + return OPAL_ERR_NOT_AVAILABLE; + } + + ucs_status = uct_md_open(md->md_name, uct_config, &md->uct_md); + if (UCS_OK != ucs_status) { + BTL_VERBOSE(("uct_md_open failed %d (%s)", ucs_status, ucs_status_string(ucs_status))); + return OPAL_ERR_NOT_AVAILABLE; + } +#endif + uct_config_release(uct_config); + + ucs_status = uct_md_query(md->uct_md, &md->md_attr); + if (UCS_OK != ucs_status) { + BTL_VERBOSE(("uct_md_query failed %d (%s)", ucs_status, ucs_status_string(ucs_status))); + return OPAL_ERR_NOT_AVAILABLE; + } + + ucs_status =
uct_md_query_tl_resources(md->uct_md, &tl_desc, &num_tls); + if (UCS_OK != ucs_status) { + BTL_VERBOSE(("uct_md_query_tl_resources failed %d (%s)", ucs_status, ucs_status_string(ucs_status))); + return OPAL_ERR_NOT_AVAILABLE; + } + + (void) mca_btl_uct_populate_tls(md, tl_desc, num_tls); + + uct_release_tl_resource_list(tl_desc); + opal_list_append(&mca_btl_uct_component.md_list, &md->super); + + return OPAL_SUCCESS; +} + +#if UCT_API >= UCT_VERSION(1, 7) +static int mca_btl_uct_component_process_uct_component(uct_component_h component) +{ + uct_component_attr_t attr = { + .field_mask = UCT_COMPONENT_ATTR_FIELD_NAME + | UCT_COMPONENT_ATTR_FIELD_MD_RESOURCE_COUNT, + }; + ucs_status_t ucs_status; + int rc; + + ucs_status = uct_component_query(component, &attr); + if (UCS_OK != ucs_status) { + return OPAL_ERROR; + } + + BTL_VERBOSE(("processing uct component %s", attr.name)); + + attr.md_resources = calloc(attr.md_resource_count, sizeof(*attr.md_resources)); + attr.field_mask |= UCT_COMPONENT_ATTR_FIELD_MD_RESOURCES; + ucs_status = uct_component_query(component, &attr); + if (UCS_OK != ucs_status) { + return OPAL_ERROR; + } + + for (unsigned i = 0; i < attr.md_resource_count; ++i) { + rc = mca_btl_uct_component_process_uct_md(component, attr.md_resources + i); + if (OPAL_SUCCESS != rc) { + break; + } + } + + free(attr.md_resources); + + return OPAL_SUCCESS; +} +#endif /* UCT_API >= UCT_VERSION(1, 7) */ + +int mca_btl_uct_component_discover_mds(void) +{ + mca_btl_uct_include_list_parse(mca_btl_uct_component.memory_domains, + &mca_btl_uct_component.memory_domain_list); + mca_btl_uct_include_list_parse(mca_btl_uct_component.connection_domains, + &mca_btl_uct_component.connection_domain_list); + +#if UCT_API >= UCT_VERSION(1, 7) + ucs_status_t ucs_status = uct_query_components(&mca_btl_uct_component.uct_components, + &mca_btl_uct_component.num_uct_components); + if (UCS_OK != ucs_status) { + BTL_ERROR(("could not query UCT components")); + return OPAL_ERROR; + } + + /* generate list of memory domains */ + for (unsigned i = 0; i < mca_btl_uct_component.num_uct_components; ++i) { + int rc = mca_btl_uct_component_process_uct_component(mca_btl_uct_component.uct_components[i]); + if (OPAL_SUCCESS != rc) { + break; + } + } +#else /* UCT 1.6 and older */ + uct_md_resource_desc_t *resources; + unsigned resource_count; + + uct_query_md_resources(&resources, &resource_count); + + /* generate all suitable btl modules */ + for (unsigned i = 0; i < resource_count; ++i) { + int rc = mca_btl_uct_component_process_uct_md(resources + i); + if (OPAL_SUCCESS != rc) { + break; + } + } + + uct_release_md_resource_list(resources); + +#endif /* UCT_API >= UCT_VERSION(1, 7) */ + + return OPAL_SUCCESS; +} + +static int mca_btl_uct_module_register_mca_var(mca_btl_uct_module_t *module) +{ + mca_base_component_t dummy_component; + /* mca_btl_uct_component starts with an mca_base_component_t structure */ + memcpy(&dummy_component, &mca_btl_uct_component, sizeof(dummy_component)); + snprintf(dummy_component.mca_component_name, sizeof(dummy_component.mca_component_name), + "uct_%s", module->md->md_name); + + BTL_VERBOSE(("registering MCA parameters for module uct_%s", module->md->md_name)); + + module->allowed_transports = mca_btl_uct_component.allowed_transports; + (void) mca_base_component_var_register( + &dummy_component, "transports", + "Comma-delimited list of transports to use sorted by increasing " + "priority. The list of transports available can be queried using ucx_info.
Special" + "values: any (any available) (default: dc_mlx5,rc_mlx5,ud,any)", + MCA_BASE_VAR_TYPE_STRING, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, + MCA_BASE_VAR_SCOPE_LOCAL, &module->allowed_transports); + + return mca_btl_base_param_register(&dummy_component, &module->super); +} + +static int tl_compare(opal_list_item_t **a, opal_list_item_t **b) +{ + mca_btl_uct_tl_t *tl_a = (mca_btl_uct_tl_t *) *a; + mca_btl_uct_tl_t *tl_b = (mca_btl_uct_tl_t *) *b; + + return tl_a->priority - tl_b->priority; +} + +static int mca_btl_uct_generate_module(mca_btl_uct_md_t *md) +{ + mca_btl_uct_tl_t *tl; + mca_btl_uct_module_t *module = mca_btl_uct_alloc_module(md, md->md_attr.rkey_packed_size); + + BTL_VERBOSE(("attempting to create a BTL module for memory domain: %s", md->md_name)); + + int rc = mca_btl_uct_module_register_mca_var(module); + if (OPAL_SUCCESS != rc) { + mca_btl_uct_finalize(&module->super); + return rc; + } + + mca_btl_uct_include_list_parse(module->allowed_transports, + &module->allowed_transport_list); + mca_btl_uct_tl_t *next; + OPAL_LIST_FOREACH_SAFE (tl, next, &md->tls, mca_btl_uct_tl_t) { + int rank = mca_btl_uct_include_list_rank(tl->uct_tl_name, &module->allowed_transport_list); + if (rank < 0) { + opal_list_remove_item(&md->tls, &tl->super); + OBJ_RELEASE(tl); + continue; + } + tl->priority = rank; + } + + opal_list_sort(&md->tls, tl_compare); + + /* Treat the flags specified by the user as a mask. */ + uint32_t btl_flags = module->super.btl_flags; + uint32_t btl_atomic_flags = module->super.btl_atomic_flags; + + module->super.btl_flags = 0; + module->super.btl_atomic_flags = 0; + + OPAL_LIST_FOREACH (tl, &md->tls, mca_btl_uct_tl_t) { + mca_btl_uct_evaluate_tl(module, tl); + if (NULL != module->am_tl && NULL != module->rdma_tl) { + /* all done */ + break; + } + } + + module->super.btl_flags &= btl_flags; + module->super.btl_atomic_flags &= btl_atomic_flags; + + if (NULL == module->rdma_tl) { + /* no rdma tls */ + BTL_VERBOSE(("no rdma tl matched supplied filter. disabling RDMA support")); + + module->super.btl_flags &= ~MCA_BTL_FLAGS_RDMA; + module->super.btl_put = NULL; + module->super.btl_get = NULL; + module->super.btl_atomic_fop = NULL; + module->super.btl_atomic_op = NULL; + } + + if (NULL == module->am_tl) { + /* no active message tls == no send/recv */ + BTL_VERBOSE(("no active message tl matched supplied filter. disabling send/recv support")); + + module->super.btl_send = NULL; + module->super.btl_sendi = NULL; + module->super.btl_alloc = NULL; + module->super.btl_free = NULL; + } + + if (NULL == module->am_tl && NULL == module->rdma_tl) { + mca_btl_uct_finalize(&module->super); + return OPAL_ERR_NOT_AVAILABLE; + } + + module->module_index = mca_btl_uct_component.module_count; + mca_btl_uct_component.modules[mca_btl_uct_component.module_count++] = module; + + return OPAL_SUCCESS; +} + +static void mca_btl_uct_enable_tl(mca_btl_uct_module_t *module, mca_btl_uct_tl_t *tl) { + if (NULL == tl) { + return; + } + + if (tl == module->am_tl) { + mca_btl_uct_device_context_t *context = + mca_btl_uct_module_get_tl_context_specific(module, tl, /*context_id=*/0); + /* If this context was created before a module was created it may not + * have an active message handler installed. Attempt to install one now. 
+         */
+        mca_btl_uct_context_enable_am_handler(tl, context);
+    }
+
+    if (tl->max_device_contexts < 1) {
+        tl->max_device_contexts = mca_btl_uct_component.num_contexts_per_module;
+    }
+}
+
+static int mca_btl_uct_enable_module(mca_btl_uct_module_t *module)
+{
+    /* NTH: a registration cache shouldn't be necessary when using UCT but there are measurable
+     * performance benefits to using rcache/grdma instead of assuming UCT will do the right
+     * thing. */
+    char *tmp = NULL;
+    (void) opal_asprintf(&tmp, "uct.%s", module->md->md_name);
+
+    mca_rcache_base_resources_t rcache_resources = {
+        .cache_name = tmp,
+        .reg_data = (void *) module,
+        .sizeof_reg = sizeof(mca_btl_uct_reg_t) + module->super.btl_registration_handle_size,
+        .register_mem = mca_btl_uct_reg_mem,
+        .deregister_mem = mca_btl_uct_dereg_mem,
+    };
+
+    module->rcache = mca_rcache_base_module_create("grdma", module, &rcache_resources);
+    free(tmp);
+    if (NULL == module->rcache) {
+        /* something went horribly wrong */
+        BTL_VERBOSE(("could not allocate a registration cache for this btl module"));
+        return OPAL_ERROR;
+    }
+
+    mca_btl_uct_enable_tl(module, module->rdma_tl);
+    mca_btl_uct_enable_tl(module, module->am_tl);
+
+    return OPAL_SUCCESS;
+}
+
+int mca_btl_uct_enable_modules(mca_btl_uct_module_t **modules, int module_count)
+{
+    for (int i = 0 ; i < module_count ; ++i) {
+        int rc = mca_btl_uct_enable_module(modules[i]);
+        if (OPAL_SUCCESS != rc) {
+            BTL_VERBOSE(("could not enable module for memory domain %s", modules[i]->md->md_name));
+            mca_btl_uct_finalize(&modules[i]->super);
+        }
+    }
+
+    return OPAL_SUCCESS;
+}
+
+int mca_btl_uct_component_generate_modules(opal_list_t *md_list)
+{
+    mca_btl_uct_component.module_count = 0;
+
+    mca_btl_uct_md_t *md;
+    OPAL_LIST_FOREACH(md, md_list, mca_btl_uct_md_t) {
+        if (MCA_BTL_UCT_MAX_MODULES == mca_btl_uct_component.module_count) {
+            BTL_VERBOSE(("created the maximum number of allowable modules"));
+            break;
+        }
+
+        if (md->connection_only_domain) {
+            /* will not build a module for this domain */
+            continue;
+        }
+
+        int rc = mca_btl_uct_generate_module(md);
+        if (OPAL_SUCCESS != rc) {
+            BTL_VERBOSE(("could not create a module for memory domain %s", md->md_name));
+        }
+    }
+
+    return OPAL_SUCCESS;
+}
+
+int mca_btl_uct_component_maybe_setup_conn_tl(void)
+{
+    bool connection_tl_required = false;
+    for (int i = 0 ; i < mca_btl_uct_component.module_count ; ++i) {
+        connection_tl_required |=
+            mca_btl_uct_tl_requires_connection_tl(mca_btl_uct_component.modules[i]->am_tl);
+        connection_tl_required |=
+            mca_btl_uct_tl_requires_connection_tl(mca_btl_uct_component.modules[i]->rdma_tl);
+        if (connection_tl_required) {
+            break;
+        }
+    }
+
+    if (!connection_tl_required) {
+        return OPAL_SUCCESS;
+    }
+
+    mca_btl_uct_md_t *md;
+    OPAL_LIST_FOREACH(md, &mca_btl_uct_component.md_list, mca_btl_uct_md_t) {
+        mca_btl_uct_tl_t *tl, *next;
+        OPAL_LIST_FOREACH_SAFE(tl, next, &md->tls, mca_btl_uct_tl_t) {
+            if (mca_btl_uct_tl_supports_conn(tl)) {
+                break;
+            }
+            tl = NULL;
+        }
+
+        if ((opal_list_item_t *) tl == &md->tls.opal_list_sentinel) {
+            BTL_VERBOSE(("No suitable connection tls in md %s", md->md_name));
+            continue;
+        }
+
+        /* take the first suitable tl found; prefer one from a connection-only domain */
+        if (NULL == mca_btl_uct_component.conn_tl || md->connection_only_domain) {
+            mca_btl_uct_component.conn_tl = tl;
+        }
+
+        if (md->connection_only_domain) {
+            /* not going to do better than a connection-only domain */
+            break;
+        }
+    }
+
+    if (NULL == mca_btl_uct_component.conn_tl) {
+        /* no connection tl found, will need to disable all connect-to-endpoint modules */
+        BTL_VERBOSE(("could not find a suitable transport to support forming connections"));
+        return OPAL_ERR_NOT_FOUND;
+    }
+
+    BTL_VERBOSE(("using transport %s::%s for connection management",
+                 mca_btl_uct_component.conn_tl->uct_md->md_name,
+                 mca_btl_uct_component.conn_tl->uct_tl_name));
+
+    return mca_btl_uct_enable_tl_conn(mca_btl_uct_component.conn_tl);
+}
+
+int mca_btl_uct_component_filter_mds(void)
+{
+    int usable_module_count = mca_btl_uct_component.module_count;
+    /* clean out all unused mds, tls, and unusable modules */
+    if (NULL == mca_btl_uct_component.conn_tl) {
+        for (int i = 0 ; i < mca_btl_uct_component.module_count ; ++i) {
+            mca_btl_uct_module_t *module = mca_btl_uct_component.modules[i];
+            if (!(mca_btl_uct_tl_requires_connection_tl(module->am_tl) ||
+                  mca_btl_uct_tl_requires_connection_tl(module->rdma_tl))) {
+                continue;
+            }
+
+            /* module is unusable */
+            mca_btl_uct_finalize(&module->super);
+            mca_btl_uct_component.modules[i] = NULL;
+            --usable_module_count;
+        }
+    }
+
+    mca_btl_uct_md_t *md, *md_next;
+    OPAL_LIST_FOREACH_SAFE(md, md_next, &mca_btl_uct_component.md_list, mca_btl_uct_md_t) {
+        mca_btl_uct_module_t *module = NULL;
+        for (int i = 0 ; i < mca_btl_uct_component.module_count ; ++i) {
+            module = mca_btl_uct_component.modules[i];
+            if (NULL != module && module->md == md) {
+                break;
+            }
+            module = NULL;
+        }
+
+        mca_btl_uct_tl_t *tl, *next;
+        OPAL_LIST_FOREACH_SAFE(tl, next, &md->tls, mca_btl_uct_tl_t) {
+            if (tl == mca_btl_uct_component.conn_tl || (NULL != module &&
+                                                        (tl == module->rdma_tl ||
+                                                         tl == module->am_tl))) {
+                /* tl is in use */
+                continue;
+            }
+            opal_list_remove_item(&md->tls, &tl->super);
+            OBJ_RELEASE(tl);
+        }
+
+        if (opal_list_get_size(&md->tls) == 0) {
+            opal_list_remove_item(&mca_btl_uct_component.md_list, &md->super);
+            OBJ_RELEASE(md);
+        }
+    }
+
+    /* remove holes in the module array by compacting the non-NULL entries */
+    if (usable_module_count < mca_btl_uct_component.module_count) {
+        int out = 0;
+        for (int i = 0 ; i < mca_btl_uct_component.module_count ; ++i) {
+            if (NULL != mca_btl_uct_component.modules[i]) {
+                mca_btl_uct_component.modules[out++] = mca_btl_uct_component.modules[i];
+            }
+        }
+        mca_btl_uct_component.module_count = usable_module_count;
+    }
+
+    return OPAL_SUCCESS;
+}
diff --git a/opal/mca/btl/uct/btl_uct_discover.h b/opal/mca/btl/uct/btl_uct_discover.h
new file mode 100644
index 00000000000..08b03899fc4
--- /dev/null
+++ b/opal/mca/btl/uct/btl_uct_discover.h
@@ -0,0 +1,43 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
+/*
+ * Copyright (c) 2025 Google, LLC. All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#if !defined(MCA_BTL_UCT_DISCOVER_H)
+#define MCA_BTL_UCT_DISCOVER_H
+
+#include "btl_uct.h"
+#include "opal/class/opal_list.h"
+
+/**
+ * @brief Query UCT for the available memory domains. This list will be limited by
+ *        the memory_domains and connection_domains MCA include lists.
+ */
+int mca_btl_uct_component_discover_mds(void);
+
+/**
+ * @brief Create BTL modules from the memory domain list.
+ *
+ * The modules are registered with MCA and must be shut down using
+ * mca_btl_module_finalize.
+ */
+int mca_btl_uct_component_generate_modules(opal_list_t *md_list);
+
+int mca_btl_uct_enable_modules(mca_btl_uct_module_t **modules, int module_count);
+
+/**
+ * @brief Scan detected transports and find a connection transport (if needed).
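+ *
+ * As a sketch (assuming the component init path drives these helpers in this
+ * order; error handling omitted):
+ *
+ *   mca_btl_uct_component_discover_mds();
+ *   mca_btl_uct_component_generate_modules(&mca_btl_uct_component.md_list);
+ *   (void) mca_btl_uct_component_maybe_setup_conn_tl();
+ *   mca_btl_uct_component_filter_mds();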
+ */ +int mca_btl_uct_component_maybe_setup_conn_tl(void); + +/** + * @brief Clean out unused memory domains and transport layers. + */ +int mca_btl_uct_component_filter_mds(void); + + +#endif /* !defined(MCA_BTL_UCT_DISCOVER_H) */ diff --git a/opal/mca/btl/uct/btl_uct_endpoint.c b/opal/mca/btl/uct/btl_uct_endpoint.c index 695fd754aa2..7dd7f5d3699 100644 --- a/opal/mca/btl/uct/btl_uct_endpoint.c +++ b/opal/mca/btl/uct/btl_uct_endpoint.c @@ -16,6 +16,7 @@ #include "btl_uct.h" #include "btl_uct_am.h" #include "btl_uct_device_context.h" +#include "btl_uct_modex.h" #include "opal/mca/timer/base/base.h" #include "opal/util/proc.h" @@ -24,7 +25,7 @@ static void mca_btl_uct_endpoint_construct(mca_btl_uct_endpoint_t *endpoint) memset(endpoint->uct_eps, 0, sizeof(endpoint->uct_eps[0]) * mca_btl_uct_component.num_contexts_per_module); endpoint->conn_ep = NULL; - OBJ_CONSTRUCT(&endpoint->ep_lock, opal_recursive_mutex_t); + OBJ_CONSTRUCT(&endpoint->ep_lock, opal_mutex_t); } static void mca_btl_uct_endpoint_destruct(mca_btl_uct_endpoint_t *endpoint) @@ -63,53 +64,6 @@ mca_btl_base_endpoint_t *mca_btl_uct_endpoint_create(opal_proc_t *proc) return (mca_btl_base_endpoint_t *) endpoint; } -static unsigned char *mca_btl_uct_process_modex_tl(unsigned char *modex_data) -{ - BTL_VERBOSE( - ("processing modex for tl %s. size: %u", modex_data + 4, *((uint32_t *) modex_data))); - - /* skip size and name */ - return modex_data + 4 + strlen((char *) modex_data + 4) + 1; -} - -static void mca_btl_uct_process_modex(mca_btl_uct_module_t *uct_btl, unsigned char *modex_data, - unsigned char **rdma_tl_data, unsigned char **am_tl_data, - unsigned char **conn_tl_data) -{ - BTL_VERBOSE(("processing remote modex data")); - - if (uct_btl->rdma_tl) { - BTL_VERBOSE(("modex contains RDMA data")); - if (rdma_tl_data) { - *rdma_tl_data = mca_btl_uct_process_modex_tl(modex_data); - } - modex_data += *((uint32_t *) modex_data); - } else if (rdma_tl_data) { - *rdma_tl_data = NULL; - } - - if (uct_btl->am_tl && uct_btl->am_tl != uct_btl->rdma_tl) { - BTL_VERBOSE(("modex contains active message data")); - if (am_tl_data) { - *am_tl_data = mca_btl_uct_process_modex_tl(modex_data); - } - modex_data += *((uint32_t *) modex_data); - } else if (am_tl_data) { - *am_tl_data = NULL; - } - - if (uct_btl->conn_tl && uct_btl->conn_tl != uct_btl->rdma_tl - && uct_btl->conn_tl != uct_btl->am_tl) { - BTL_VERBOSE(("modex contains connection data")); - if (conn_tl_data) { - *conn_tl_data = mca_btl_uct_process_modex_tl(modex_data); - } - modex_data += *((uint32_t *) modex_data); - } else if (conn_tl_data) { - *conn_tl_data = NULL; - } -} - static inline ucs_status_t mca_btl_uct_ep_create_connected_compat(uct_iface_h iface, uct_device_addr_t *device_addr, uct_iface_addr_t *iface_addr, @@ -150,7 +104,7 @@ static int mca_btl_uct_endpoint_connect_iface(mca_btl_uct_module_t *uct_btl, mca /* easy case. just connect to the interface */ iface_addr = (uct_iface_addr_t *) tl_data; device_addr = (uct_device_addr_t *) ((uintptr_t) iface_addr - + MCA_BTL_UCT_TL_ATTR(tl, tl_context->context_id) + + tl->uct_iface_attr .iface_addr_len); BTL_VERBOSE(("connecting endpoint to interface")); @@ -164,22 +118,6 @@ static int mca_btl_uct_endpoint_connect_iface(mca_btl_uct_module_t *uct_btl, mca return (UCS_OK == ucs_status) ? 
OPAL_SUCCESS : OPAL_ERROR; } -static void mca_btl_uct_connection_ep_construct(mca_btl_uct_connection_ep_t *ep) -{ - ep->uct_ep = NULL; -} - -static void mca_btl_uct_connection_ep_destruct(mca_btl_uct_connection_ep_t *ep) -{ - if (ep->uct_ep) { - uct_ep_destroy(ep->uct_ep); - ep->uct_ep = NULL; - } -} - -OBJ_CLASS_INSTANCE(mca_btl_uct_connection_ep_t, opal_object_t, mca_btl_uct_connection_ep_construct, - mca_btl_uct_connection_ep_destruct); - struct mca_btl_uct_conn_completion_t { uct_completion_t super; volatile bool complete; @@ -203,24 +141,61 @@ static void mca_btl_uct_endpoint_flush_complete(uct_completion_t *self, ucs_stat } #endif +static void mca_btl_uct_flush_conn_endpoint(mca_btl_uct_connection_ep_t *conn_ep) +{ + mca_btl_uct_device_context_t *conn_tl_context = conn_ep->tl->uct_dev_contexts[0]; + mca_btl_uct_conn_completion_t completion + = {.super = {.count = 1, .func = mca_btl_uct_endpoint_flush_complete}, .complete = false}; + ucs_status_t ucs_status; + MCA_BTL_UCT_CONTEXT_SERIALIZE(conn_tl_context, { + ucs_status = uct_ep_flush(conn_ep->uct_ep, 0, &completion.super); + }); + if (UCS_OK != ucs_status && UCS_INPROGRESS != ucs_status) { + /* NTH: I don't know if this path is needed. For some networks we must use a completion. */ + do { + MCA_BTL_UCT_CONTEXT_SERIALIZE(conn_tl_context, { + ucs_status = uct_ep_flush(conn_ep->uct_ep, 0, NULL); + }); + mca_btl_uct_context_progress(conn_tl_context); + } while (UCS_INPROGRESS == ucs_status); + } else if (UCS_OK != ucs_status) { + do { + mca_btl_uct_context_progress(conn_tl_context); + } while (!completion.complete); + } +} + +static void mca_btl_uct_connection_ep_construct(mca_btl_uct_connection_ep_t *ep) +{ + ep->uct_ep = NULL; + ep->tl = NULL; +} + +static void mca_btl_uct_connection_ep_destruct(mca_btl_uct_connection_ep_t *ep) +{ + if (ep->uct_ep) { + mca_btl_uct_flush_conn_endpoint(ep); + uct_ep_destroy(ep->uct_ep); + ep->uct_ep = NULL; + } +} + +OBJ_CLASS_INSTANCE(mca_btl_uct_connection_ep_t, opal_object_t, mca_btl_uct_connection_ep_construct, + mca_btl_uct_connection_ep_destruct); + static int mca_btl_uct_endpoint_send_conn_req(mca_btl_uct_module_t *uct_btl, mca_btl_base_endpoint_t *endpoint, - mca_btl_uct_device_context_t *conn_tl_context, mca_btl_uct_conn_req_t *request, size_t request_length) { - mca_btl_uct_conn_completion_t completion - = {.super = {.count = 1, .func = mca_btl_uct_endpoint_flush_complete}, .complete = false}; - ucs_status_t ucs_status; + mca_btl_uct_device_context_t *conn_tl_context = mca_btl_uct_component.conn_tl->uct_dev_contexts[0]; BTL_VERBOSE( ("sending connection request to peer. 
context id: %d, type: %d, length: %" PRIsize_t, request->context_id, request->type, request_length)); - /* need to drop the lock to avoid hold-and-wait */ - opal_mutex_unlock(&endpoint->ep_lock); - do { + ucs_status_t ucs_status; MCA_BTL_UCT_CONTEXT_SERIALIZE(conn_tl_context, { ucs_status = uct_ep_am_short(endpoint->conn_ep->uct_ep, MCA_BTL_UCT_CONNECT_RDMA, request->type, request, request_length); @@ -233,75 +208,70 @@ static int mca_btl_uct_endpoint_send_conn_req(mca_btl_uct_module_t *uct_btl, return OPAL_ERROR; } + /* need to drop the lock to avoid hold-and-wait */ + opal_mutex_unlock(&endpoint->ep_lock); /* some TLs (UD for example) need to be progressed to get resources */ mca_btl_uct_context_progress(conn_tl_context); + opal_mutex_lock(&endpoint->ep_lock); } while (1); - /* for now we just wait for the connection request to complete before continuing */ - ucs_status = uct_ep_flush(endpoint->conn_ep->uct_ep, 0, &completion.super); - if (UCS_OK != ucs_status && UCS_INPROGRESS != ucs_status) { - /* NTH: I don't know if this path is needed. For some networks we must use a completion. */ - do { - ucs_status = uct_ep_flush(endpoint->conn_ep->uct_ep, 0, NULL); - mca_btl_uct_context_progress(conn_tl_context); - } while (UCS_INPROGRESS == ucs_status); - } else { - do { - mca_btl_uct_context_progress(conn_tl_context); - } while (!completion.complete); - } - - opal_mutex_lock(&endpoint->ep_lock); - return OPAL_SUCCESS; } -static int mca_btl_uct_endpoint_send_connection_data( - mca_btl_uct_module_t *uct_btl, mca_btl_base_endpoint_t *endpoint, mca_btl_uct_tl_t *tl, - mca_btl_uct_device_context_t *tl_context, mca_btl_uct_tl_endpoint_t *tl_endpoint, - uint8_t *conn_tl_data, int request_type) +static int mca_btl_uct_endpoint_get_helper_endpoint(mca_btl_uct_module_t *uct_btl, mca_btl_base_endpoint_t *endpoint, + uint8_t *conn_tl_data) { - mca_btl_uct_tl_t *conn_tl = uct_btl->conn_tl; - mca_btl_uct_device_context_t *conn_tl_context = conn_tl->uct_dev_contexts[0]; - uct_device_addr_t *device_addr = NULL; - uct_iface_addr_t *iface_addr; - ucs_status_t ucs_status; + if (NULL != endpoint->conn_ep) { + BTL_VERBOSE(("re-using existing connection endpoint")); + OBJ_RETAIN(endpoint->conn_ep); + return OPAL_SUCCESS; + } - assert(NULL != conn_tl); + mca_btl_uct_tl_t *conn_tl = mca_btl_uct_component.conn_tl; - BTL_VERBOSE(("connecting endpoint to remote endpoint")); + BTL_VERBOSE(("creating a temporary endpoint for handling connections to %p", + opal_process_name_print(endpoint->ep_proc->proc_name))); - if (NULL == endpoint->conn_ep) { - BTL_VERBOSE(("creating a temporary endpoint for handling connections to %p", - opal_process_name_print(endpoint->ep_proc->proc_name))); + uct_iface_addr_t *iface_addr = (uct_iface_addr_t *) conn_tl_data; + uct_device_addr_t *device_addr = (uct_device_addr_t *) ((uintptr_t) conn_tl_data + + conn_tl->uct_iface_attr.iface_addr_len); - iface_addr = (uct_iface_addr_t *) conn_tl_data; - device_addr = (uct_device_addr_t *) ((uintptr_t) conn_tl_data - + MCA_BTL_UCT_TL_ATTR(conn_tl, 0).iface_addr_len); + endpoint->conn_ep = OBJ_NEW(mca_btl_uct_connection_ep_t); + if (OPAL_UNLIKELY(NULL == endpoint->conn_ep)) { + return OPAL_ERR_OUT_OF_RESOURCE; + } - endpoint->conn_ep = OBJ_NEW(mca_btl_uct_connection_ep_t); - if (OPAL_UNLIKELY(NULL == endpoint->conn_ep)) { - return OPAL_ERR_OUT_OF_RESOURCE; - } + endpoint->conn_ep->tl = conn_tl; - /* create a temporary endpoint for setting up the rdma endpoint */ - MCA_BTL_UCT_CONTEXT_SERIALIZE(conn_tl_context, { + ucs_status_t ucs_status; + 
mca_btl_uct_device_context_t *conn_tl_context = conn_tl->uct_dev_contexts[0]; + /* create a temporary endpoint for setting up the rdma endpoint */ + MCA_BTL_UCT_CONTEXT_SERIALIZE(conn_tl_context, { ucs_status = mca_btl_uct_ep_create_connected_compat(conn_tl_context->uct_iface, device_addr, iface_addr, &endpoint->conn_ep->uct_ep); }); - if (UCS_OK != ucs_status) { - BTL_VERBOSE( - ("could not create an endpoint for forming connection to remote peer. code = %d", - ucs_status)); - return OPAL_ERROR; - } - } else { - OBJ_RETAIN(endpoint->conn_ep); + if (UCS_OK != ucs_status) { + BTL_VERBOSE( + ("could not create an endpoint for forming connection to remote peer. code = %d", + ucs_status)); + return OPAL_ERROR; } + return OPAL_SUCCESS; +} + +static int mca_btl_uct_endpoint_send_connection_data( + mca_btl_uct_module_t *uct_btl, mca_btl_base_endpoint_t *endpoint, + mca_btl_uct_tl_t *tl, mca_btl_uct_device_context_t *tl_context, + mca_btl_uct_tl_endpoint_t *tl_endpoint, int request_type, int remote_module_index) +{ + ucs_status_t ucs_status; + + BTL_VERBOSE(("connecting endpoint to remote endpoint")); + size_t request_length = sizeof(mca_btl_uct_conn_req_t) - + MCA_BTL_UCT_TL_ATTR(tl, tl_context->context_id).ep_addr_len; + + tl->uct_iface_attr.ep_addr_len; mca_btl_uct_conn_req_t *request = alloca(request_length); /* fill in common request parameters */ @@ -309,6 +279,7 @@ static int mca_btl_uct_endpoint_send_connection_data( request->context_id = tl_context->context_id; request->tl_index = tl->tl_index; request->type = request_type; + request->module_index = remote_module_index; /* fill in connection request */ ucs_status = uct_ep_get_address(tl_endpoint->uct_ep, (uct_ep_addr_t *) request->ep_addr); @@ -322,7 +293,7 @@ static int mca_btl_uct_endpoint_send_connection_data( /* let the remote side know that the connection has been established and * wait for the message to be sent */ - int rc = mca_btl_uct_endpoint_send_conn_req(uct_btl, endpoint, conn_tl_context, request, + int rc = mca_btl_uct_endpoint_send_conn_req(uct_btl, endpoint, request, request_length); if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { OBJ_RELEASE(endpoint->conn_ep); @@ -337,9 +308,9 @@ static int mca_btl_uct_endpoint_send_connection_data( } static int mca_btl_uct_endpoint_connect_endpoint( - mca_btl_uct_module_t *uct_btl, mca_btl_base_endpoint_t *endpoint, mca_btl_uct_tl_t *tl, - mca_btl_uct_device_context_t *tl_context, mca_btl_uct_tl_endpoint_t *tl_endpoint, - uint8_t *tl_data, uint8_t *conn_tl_data, void *ep_addr) + mca_btl_uct_module_t *uct_btl, mca_btl_base_endpoint_t *endpoint, + mca_btl_uct_tl_t *tl, mca_btl_uct_device_context_t *tl_context, + mca_btl_uct_tl_endpoint_t *tl_endpoint, uint8_t *tl_data, void *ep_addr, int remote_module_index) { ucs_status_t ucs_status; @@ -367,20 +338,23 @@ static int mca_btl_uct_endpoint_connect_endpoint( if (UCS_OK != ucs_status) { return OPAL_ERROR; } - - mca_btl_uct_endpoint_set_flag(uct_btl, endpoint, tl_context->context_id, tl_endpoint, - MCA_BTL_UCT_ENDPOINT_FLAG_EP_CONNECTED); } opal_timer_t now = opal_timer_base_get_usec(); - if ((now - tl_endpoint->last_connection_req) < mca_btl_uct_component.connection_retry_timeout && !ep_addr) { - return (tl_endpoint->flags & MCA_BTL_UCT_ENDPOINT_FLAG_CONN_READY) ? 
OPAL_SUCCESS
-                                                                           : OPAL_ERR_OUT_OF_RESOURCE;
+    if ((now - tl_endpoint->last_connection_req) > mca_btl_uct_component.connection_retry_timeout || ep_addr) {
+        int rc = mca_btl_uct_endpoint_send_connection_data(uct_btl, endpoint, tl, tl_context, tl_endpoint,
+                                                           /*request_type=*/!!ep_addr, remote_module_index);
+        if (OPAL_SUCCESS != rc) {
+            return rc;
+        }
+    }
+
+    if (ep_addr) {
+        mca_btl_uct_endpoint_set_flag(uct_btl, endpoint, tl_context->context_id, tl_endpoint,
+                                      MCA_BTL_UCT_ENDPOINT_FLAG_EP_CONNECTED);
+    }

-    int rc = mca_btl_uct_endpoint_send_connection_data(uct_btl, endpoint, tl, tl_context, tl_endpoint,
-                                                       conn_tl_data, /*request_type=*/!!ep_addr);
-    return (OPAL_SUCCESS == rc) ? OPAL_ERR_OUT_OF_RESOURCE : rc;
+    return OPAL_ERR_OUT_OF_RESOURCE;
 }

 int mca_btl_uct_endpoint_connect(mca_btl_uct_module_t *uct_btl, mca_btl_uct_endpoint_t *endpoint,
@@ -392,9 +366,8 @@
                                      : uct_btl->am_tl;
     mca_btl_uct_device_context_t *tl_context
         = mca_btl_uct_module_get_tl_context_specific(uct_btl, tl, context_id);
-    uint8_t *rdma_tl_data = NULL, *conn_tl_data = NULL, *am_tl_data = NULL, *tl_data;
+    uint8_t *conn_tl_data, *tl_data = NULL;
     mca_btl_uct_modex_t *modex;
-    uint8_t *modex_data;
     size_t msg_size;
     int rc;

@@ -410,19 +383,20 @@
          !!(MCA_BTL_UCT_ENDPOINT_FLAG_CONN_READY & tl_endpoint->flags)));

     opal_mutex_lock(&endpoint->ep_lock);
-    if (MCA_BTL_UCT_ENDPOINT_FLAG_CONN_READY & tl_endpoint->flags) {
-        opal_mutex_unlock(&endpoint->ep_lock);
-        /* nothing more to do. someone else completed the connection */
-        return OPAL_SUCCESS;
-    }
-
-    /* dumpicate connection request. nothing to do until the endpoint data is received */
-    if (NULL != tl_endpoint->uct_ep && NULL == ep_addr) {
-        opal_mutex_unlock(&endpoint->ep_lock);
-        return OPAL_ERR_OUT_OF_RESOURCE;
-    }

     do {
+        if (MCA_BTL_UCT_ENDPOINT_FLAG_CONN_READY & tl_endpoint->flags) {
+            /* nothing more to do. someone else completed the connection */
+            rc = OPAL_SUCCESS;
+            break;
+        }
+
+        /* duplicate connection request. nothing to do until the endpoint data is received */
+        if (NULL != tl_endpoint->uct_ep && NULL == ep_addr) {
+            rc = OPAL_ERR_OUT_OF_RESOURCE;
+            break;
+        }
+
         /* read the modex. this is done both to start the connection and to process endpoint data */
         OPAL_MODEX_RECV(rc, &mca_btl_uct_component.super.btl_version, &endpoint->ep_proc->proc_name,
                         (void **) &modex, &msg_size);
@@ -434,45 +408,39 @@
         BTL_VERBOSE(("received modex of size %lu for proc %s. module count %d",
                      (unsigned long) msg_size, OPAL_NAME_PRINT(endpoint->ep_proc->proc_name),
                      modex->module_count));

-        modex_data = modex->data;
-
-        /* look for matching transport in the modex */
-        for (int i = 0; i < modex->module_count; ++i) {
-            uint32_t modex_size = *((uint32_t *) modex_data);
-
-            BTL_VERBOSE(
-                ("found modex for md %s, searching for %s", modex_data + 4, uct_btl->md_name));
-
-            modex_data += 4;
-
-            if (0 != strcmp((char *) modex_data, uct_btl->md_name)) {
-                /* modex belongs to a different module, skip it and continue */
-                modex_data += modex_size - 4;
-                continue;
-            }
-
-            modex_data += strlen((char *) modex_data) + 1;
-            mca_btl_uct_process_modex(uct_btl, modex_data, &rdma_tl_data, &am_tl_data,
-                                      &conn_tl_data);
+        int remote_module_index;
+        tl_data = mca_btl_uct_find_modex(modex, tl, &remote_module_index);
+        if (OPAL_UNLIKELY(NULL == tl_data)) {
+            BTL_ERROR(("could not find modex data for this transport"));
+            rc = OPAL_ERR_UNREACH;
             break;
         }

-        tl_data = (tl == uct_btl->rdma_tl) ? rdma_tl_data : am_tl_data;
+        /* connect the endpoint */
+        if (mca_btl_uct_tl_requires_connection_tl(tl)) {
+            conn_tl_data = mca_btl_uct_find_modex(modex, mca_btl_uct_component.conn_tl,
+                                                  /*remote_module_index=*/NULL);
+            if (OPAL_UNLIKELY(NULL == conn_tl_data)) {
+                BTL_ERROR(("could not find modex for connection module"));
+                rc = OPAL_ERR_UNREACH;
+                break;
+            }

-        if (NULL == tl_data) {
-            opal_mutex_unlock(&endpoint->ep_lock);
-            return OPAL_ERR_UNREACH;
-        }
-        /* connect the endpoint */
-        if (!mca_btl_uct_tl_requires_connection_tl(tl)) {
-            rc = mca_btl_uct_endpoint_connect_iface(uct_btl, tl, tl_context, tl_endpoint, tl_data);
+            if (NULL == tl_endpoint->uct_ep) {
+                /* allocate or retain a connection endpoint */
+                rc = mca_btl_uct_endpoint_get_helper_endpoint(uct_btl, endpoint,
+                                                              conn_tl_data);
+                if (OPAL_SUCCESS != rc) {
+                    break;
+                }
+            }
+
+            rc = mca_btl_uct_endpoint_connect_endpoint(uct_btl, endpoint, tl, tl_context, tl_endpoint,
+                                                       tl_data, ep_addr, remote_module_index);
         } else {
-            rc = mca_btl_uct_endpoint_connect_endpoint(uct_btl, endpoint, tl, tl_context,
-                                                       tl_endpoint, tl_data, conn_tl_data, ep_addr);
+            rc = mca_btl_uct_endpoint_connect_iface(uct_btl, tl, tl_context, tl_endpoint, tl_data);
         }
-
     } while (0);

     opal_mutex_unlock(&endpoint->ep_lock);
diff --git a/opal/mca/btl/uct/btl_uct_frag.c b/opal/mca/btl/uct/btl_uct_frag.c
index 37eee126654..de2a5ba8792 100644
--- a/opal/mca/btl/uct/btl_uct_frag.c
+++ b/opal/mca/btl/uct/btl_uct_frag.c
@@ -26,11 +26,25 @@
     frag->base.des_segment_count = 1;
     frag->segments[0].seg_addr.pval = frag->base.super.ptr;

-    frag->uct_iov.buffer = frag->base.super.ptr;
-    frag->uct_iov.stride = 0;
-    frag->uct_iov.count = 1;
+    /* header */
+    frag->uct_iov[0].buffer = &frag->header;
+    frag->uct_iov[0].length = sizeof(frag->header);
+    frag->uct_iov[0].stride = 0;
+    frag->uct_iov[0].count = 1;
+
+    /* fragment buffer (reserve with or without data) */
+    frag->uct_iov[1].buffer = frag->base.super.ptr;
+    frag->uct_iov[1].stride = 0;
+    frag->uct_iov[1].count = 1;
+
+    /* reserved for user data */
+    frag->uct_iov[2].buffer = NULL;
+    frag->uct_iov[2].stride = 0;
+    frag->uct_iov[2].length = 0;
+    frag->uct_iov[2].count = 1;
+
+    frag->uct_iov_count = 1;

     if (reg) {
-        frag->uct_iov.memh = reg->uct_memh;
+        frag->uct_iov[1].memh = reg->uct_memh;
     }
 }
diff --git a/opal/mca/btl/uct/btl_uct_include_list.c b/opal/mca/btl/uct/btl_uct_include_list.c
new file mode 100644
index 00000000000..5e989581612
--- /dev/null
+++ b/opal/mca/btl/uct/btl_uct_include_list.c
@@ -0,0 +1,78 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
+/*
+ * Copyright (c) 2024-2025 Google, LLC. All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include <regex.h>
+
+#include "opal_config.h"
+
+#include "btl_uct_include_list.h"
+#include "btl_uct_types.h"
+#include "opal/class/opal_object.h"
+#include "opal/mca/btl/base/btl_base_error.h"
+#include "opal/util/argv.h"
+
+void mca_btl_uct_include_list_parse (const char *value, mca_btl_uct_include_list_t *list) {
+    list->list = NULL;
+    list->include = true;
+
+    if (value == NULL) {
+        return;
+    }
+
+    if (value[0] == '^') {
+        list->include = false;
+        value++;
+    }
+
+    list->list = opal_argv_split(value, ',');
+}
+
+int mca_btl_uct_include_list_rank (const char *name, const mca_btl_uct_include_list_t *list) {
+    if (list->list == NULL) {
+        return -1;
+    }
+
+    for (int i = 0; list->list[i]; ++i) {
+        regex_t preg;
+
+        BTL_VERBOSE(("evaluating %s vs %s-list item %s", name, list->include ? "include" : "exclude", list->list[i]));
+        int rc = regcomp(&preg, list->list[i], REG_ICASE);
+        if (0 != rc) {
+            char errbuf[256];
+            regerror(rc, &preg, errbuf, sizeof(errbuf));
+            BTL_ERROR(("when matching name, could not parse regular expression: %s, error: %s", list->list[i], errbuf));
+            continue;
+        }
+
+        int result = regexec(&preg, name, /*nmatch=*/0, /*pmatch=*/NULL, /*eflags=*/0);
+        regfree(&preg);
+        if (0 == result) {
+            return list->include ? i + 1 : -(i + 1);
+        }
+    }
+
+    return list->include ? -1 : 1;
+}
+
+static void mca_btl_uct_include_list_construct (mca_btl_uct_include_list_t *list)
+{
+    list->list = NULL;
+}
+
+static void mca_btl_uct_include_list_destruct (mca_btl_uct_include_list_t *list)
+{
+    opal_argv_free (list->list);
+    list->list = NULL;
+}
+
+OBJ_CLASS_INSTANCE(mca_btl_uct_include_list_t, opal_object_t, mca_btl_uct_include_list_construct,
+                   mca_btl_uct_include_list_destruct);
diff --git a/opal/mca/btl/uct/btl_uct_include_list.h b/opal/mca/btl/uct/btl_uct_include_list.h
new file mode 100644
index 00000000000..69fba979d8d
--- /dev/null
+++ b/opal/mca/btl/uct/btl_uct_include_list.h
@@ -0,0 +1,34 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
+/*
+ * Copyright (c) 2024-2025 Google, LLC. All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#if !defined(BTL_UCT_INCLUDE_LIST_H)
+#define BTL_UCT_INCLUDE_LIST_H
+
+#include "btl_uct_types.h"
+
+/**
+ * @brief Parse `value` to create an include list.
+ *
+ * @param[in]     value Comma-delimited string to parse. A leading '^' turns the
+ *                      whole list into an exclude list.
+ * @param[in,out] list  Include list object, must already be constructed.
+ */
+void mca_btl_uct_include_list_parse (const char *value, mca_btl_uct_include_list_t *list);
+
+/**
+ * @brief Find the rank of `name` in the include list `list`.
+ *
+ * @param[in] name name to find
+ * @param[in] list list to search
+ *
+ * Returns the 1-based position of the first pattern matching `name`, negated
+ * when the list is an exclude list. If nothing matches, returns -1 for an
+ * include list and 1 for an exclude list, so a negative result always means
+ * the name should not be used.
+ */
+int mca_btl_uct_include_list_rank (const char *name, const mca_btl_uct_include_list_t *list);
+
+#endif /* !defined(BTL_UCT_INCLUDE_LIST_H) */
diff --git a/opal/mca/btl/uct/btl_uct_modex.c b/opal/mca/btl/uct/btl_uct_modex.c
new file mode 100644
index 00000000000..7d6aa2f5450
--- /dev/null
+++ b/opal/mca/btl/uct/btl_uct_modex.c
@@ -0,0 +1,198 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
+/*
+ * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation. All rights reserved.
+ * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2018 Research Organization for Information Science + * and Technology (RIST). All rights reserved. + * Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved. + * Copyright (c) 2018-2024 Triad National Security, LLC. All rights + * reserved. + * Copyright (c) 2019-2025 Google, LLC. All rights reserved. + * Copyright (c) 2019 Intel, Inc. All rights reserved. + * Copyright (c) 2022 IBM Corporation. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "opal_config.h" + +#include "btl_uct_modex.h" +#include "btl_uct_types.h" +#include "btl_uct_device_context.h" +#include "opal/class/opal_list.h" +#include "opal/mca/pmix/pmix-internal.h" + +static uint16_t mca_btl_uct_tl_modex_size(mca_btl_uct_tl_t *tl) +{ + uint16_t size = sizeof(mca_btl_uct_tl_modex_t); + + if (tl->uct_iface_attr.cap.flags & UCT_IFACE_FLAG_CONNECT_TO_IFACE) { + size += (uint16_t)tl->uct_iface_attr.iface_addr_len; + } + + /* pad out to a multiple of 4 bytes */ + return (3 + size + (uint16_t)tl->uct_iface_attr.device_addr_len) & ~3; +} + +static uint16_t mca_btl_uct_md_modex_size(mca_btl_uct_md_t *md) +{ + uint16_t modex_size = sizeof(mca_btl_uct_md_modex_t); + + mca_btl_uct_tl_t *tl; + OPAL_LIST_FOREACH(tl, &md->tls, mca_btl_uct_tl_t) { + modex_size += mca_btl_uct_tl_modex_size(tl); + } + + return modex_size; +} + +static uint8_t *mca_btl_uct_tl_modex_pack(mca_btl_uct_module_t *module, mca_btl_uct_tl_t *tl, + uint8_t *modex_data) +{ + mca_btl_uct_device_context_t *dev_context = + mca_btl_uct_module_get_tl_context_specific(module, tl, /*context_id=*/0); + + mca_btl_uct_tl_modex_t *tl_modex = (mca_btl_uct_tl_modex_t *)modex_data; + tl_modex->size = mca_btl_uct_tl_modex_size(tl); + + memset(tl_modex->tl_name, 0, sizeof(tl_modex->tl_name)); + strncpy(tl_modex->tl_name, tl->uct_tl_name, sizeof(tl_modex->tl_name)); + + uint8_t *tl_modex_data = (uint8_t *) tl_modex->data; + + /* NTH: only the first context is available. i assume the device addresses of the + * contexts will be the same but they will have different iface addresses. i also + * am assuming that it doesn't really matter if all remote contexts connect to + * the same endpoint since we are only doing RDMA. if any of these assumptions are + * wrong then we can't delay creating the other contexts and must include their + * information in the modex. 
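+     *
+     * for reference, the per-tl layout packed below is
+     *   [uint16 size][tl_name][iface address (CONNECT_TO_IFACE only)][device address]
+     * padded to a multiple of 4 bytes by mca_btl_uct_tl_modex_size().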
*/ + if (tl->uct_iface_attr.cap.flags & UCT_IFACE_FLAG_CONNECT_TO_IFACE) { + uct_iface_get_address(dev_context->uct_iface, (uct_iface_addr_t *) tl_modex_data); + tl_modex_data += tl->uct_iface_attr.iface_addr_len; + } + + uct_iface_get_device_address(dev_context->uct_iface, (uct_device_addr_t *) tl_modex_data); + tl_modex_data += tl->uct_iface_attr.device_addr_len; + + return modex_data + tl_modex->size; +} + +static uint8_t *mca_btl_uct_modex_pack(mca_btl_uct_md_t *md, uint8_t *modex_data) +{ + mca_btl_uct_module_t *module = NULL; + for (int i = 0 ; i < mca_btl_uct_component.module_count ; ++i) { + if (mca_btl_uct_component.modules[i]->md == md) { + module = mca_btl_uct_component.modules[i]; + break; + } + } + + mca_btl_uct_md_modex_t *md_modex = (mca_btl_uct_md_modex_t *)modex_data; + modex_data = md_modex->data; + + md_modex->size = mca_btl_uct_md_modex_size(md); + md_modex->module_index = module ? module->module_index : (uint16_t) -1; + + memset(md_modex->md_name, 0, sizeof(md_modex->md_name)); + strncpy(md_modex->md_name, md->md_name, sizeof(md_modex->md_name)); + + mca_btl_uct_tl_t *tl; + OPAL_LIST_FOREACH(tl, &md->tls, mca_btl_uct_tl_t) { + modex_data = mca_btl_uct_tl_modex_pack(module, tl, modex_data); + } + + return modex_data; +} + +int mca_btl_uct_component_modex_send(void) +{ + size_t modex_size = sizeof(mca_btl_uct_modex_t); + mca_btl_uct_modex_t *modex; + uint8_t *modex_data; + int rc; + + mca_btl_uct_md_t *md; + OPAL_LIST_FOREACH(md, &mca_btl_uct_component.md_list, mca_btl_uct_md_t) { + modex_size += mca_btl_uct_md_modex_size(md); + } + + modex = alloca(modex_size); + modex_data = modex->data; + + modex->module_count = opal_list_get_size(&mca_btl_uct_component.md_list); + OPAL_LIST_FOREACH(md, &mca_btl_uct_component.md_list, mca_btl_uct_md_t) { + modex_data = mca_btl_uct_modex_pack(md, modex_data); + } + + OPAL_MODEX_SEND(rc, PMIX_GLOBAL, &mca_btl_uct_component.super.btl_version, modex, modex_size); + return rc; +} + +static uint8_t *mca_btl_uct_find_tl_modex(mca_btl_uct_md_modex_t *md_modex, mca_btl_uct_tl_t *tl) +{ + uint8_t *modex_data = md_modex->data; + + for (uint16_t modex_offset = 0 ; modex_offset < md_modex->size ; ){ + mca_btl_uct_tl_modex_t *tl_modex = (mca_btl_uct_tl_modex_t *)(modex_data + modex_offset); + + BTL_VERBOSE(("found modex for tl %s searching for %s", tl_modex->tl_name, tl->uct_tl_name)); + + if (0 == strcmp(tl->uct_tl_name, tl_modex->tl_name)) { + return tl_modex->data; + } + + BTL_VERBOSE(("no match, continuing")); + + modex_offset += tl_modex->size; + } + + return NULL; +} + +uint8_t *mca_btl_uct_find_modex(mca_btl_uct_modex_t *modex, mca_btl_uct_tl_t *tl, int *remote_module_index) { + uint8_t *modex_data = modex->data; + + /* look for matching transport in the modex */ + for (int i = 0; i < modex->module_count; ++i) { + mca_btl_uct_md_modex_t *md_modex = (mca_btl_uct_md_modex_t *)modex_data; + + BTL_VERBOSE(("found modex for md %s (remote module index %hu), searching for %s", + md_modex->md_name, md_modex->module_index, tl->uct_md->md_name)); + + if (0 != strcmp(tl->uct_md->md_name, md_modex->md_name)) { + /* modex belongs to a different module, skip it and continue */ + modex_data += md_modex->size; + continue; + } + + uint8_t *tl_modex = mca_btl_uct_find_tl_modex(md_modex, tl); + if (NULL == tl_modex) { + break; + } + + if (NULL != remote_module_index) { + *remote_module_index = md_modex->module_index; + } + + BTL_VERBOSE(("finished processing modex for %s", tl->uct_md->md_name)); + + return tl_modex; + } + + BTL_ERROR(("could not find modex for 
%s::%s", tl->uct_md->md_name, tl->uct_tl_name)); + + return NULL; +} diff --git a/opal/mca/btl/uct/btl_uct_modex.h b/opal/mca/btl/uct/btl_uct_modex.h new file mode 100644 index 00000000000..e202bc8113f --- /dev/null +++ b/opal/mca/btl/uct/btl_uct_modex.h @@ -0,0 +1,20 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2025 Google, LLC. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#if !defined(MCA_BTL_UCT_MODEX_H) +#define MCA_BTL_UCT_MODEX_H + +#include "btl_uct.h" + +int mca_btl_uct_component_modex_send(void); + +uint8_t *mca_btl_uct_find_modex(mca_btl_uct_modex_t *modex, mca_btl_uct_tl_t *tl, int *remote_module_index); + +#endif /* !defined(MCA_BTL_UCT_MODEX_H) */ diff --git a/opal/mca/btl/uct/btl_uct_module.c b/opal/mca/btl/uct/btl_uct_module.c index 9577d615b92..9914c5e8f99 100644 --- a/opal/mca/btl/uct/btl_uct_module.c +++ b/opal/mca/btl/uct/btl_uct_module.c @@ -12,7 +12,7 @@ * All rights reserved. * Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights * reserved. - * Copyright (c) 2020 Google, LLC. All rights reserved. + * Copyright (c) 2020-2025 Google, LLC. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -90,7 +90,7 @@ static int mca_btl_uct_add_procs(mca_btl_base_module_t *btl, size_t nprocs, if (am_tl) { rc = opal_free_list_init(&uct_module->short_frags, sizeof(mca_btl_uct_base_frag_t), opal_cache_line_size, OBJ_CLASS(mca_btl_uct_base_frag_t), - MCA_BTL_UCT_TL_ATTR(am_tl, 0).cap.am.max_short, + am_tl->uct_iface_attr.cap.am.max_short, opal_cache_line_size, 0, 1024, 64, NULL, 0, NULL, NULL, NULL); rc = opal_free_list_init(&uct_module->eager_frags, sizeof(mca_btl_uct_base_frag_t), @@ -264,6 +264,35 @@ int mca_btl_uct_dereg_mem(void *reg_data, mca_rcache_base_registration_t *reg) return OPAL_SUCCESS; } +mca_btl_uct_module_t *mca_btl_uct_alloc_module(mca_btl_uct_md_t *md, + size_t registration_size) +{ + mca_btl_uct_module_t *module; + + module = malloc(sizeof(*module)); + if (NULL == module) { + return NULL; + } + + /* copy the module template */ + *module = mca_btl_uct_module_template; + + OBJ_CONSTRUCT(&module->id_to_endpoint, opal_hash_table_t); + OBJ_CONSTRUCT(&module->endpoint_lock, opal_mutex_t); + OBJ_CONSTRUCT(&module->short_frags, opal_free_list_t); + OBJ_CONSTRUCT(&module->eager_frags, opal_free_list_t); + OBJ_CONSTRUCT(&module->max_frags, opal_free_list_t); + OBJ_CONSTRUCT(&module->pending_frags, opal_list_t); + OBJ_CONSTRUCT(&module->lock, opal_recursive_mutex_t); + OBJ_CONSTRUCT(&module->allowed_transport_list, mca_btl_uct_include_list_t); + + module->md = md; + OBJ_RETAIN(md); + module->super.btl_registration_handle_size = registration_size; + + return module; +} + /* * Cleanup/release module resources. 
*/ @@ -284,31 +313,32 @@ int mca_btl_uct_finalize(mca_btl_base_module_t *btl) OBJ_DESTRUCT(&uct_module->max_frags); OBJ_DESTRUCT(&uct_module->pending_frags); OBJ_DESTRUCT(&uct_module->lock); - OBJ_DESTRUCT(&uct_module->pending_connection_reqs); + OBJ_DESTRUCT(&uct_module->allowed_transport_list); if (uct_module->rcache) { mca_rcache_base_module_destroy(uct_module->rcache); } - if (NULL != uct_module->am_tl) { - OBJ_RELEASE(uct_module->am_tl); - } - - if (NULL != uct_module->conn_tl) { - OBJ_RELEASE(uct_module->conn_tl); - } + OBJ_DESTRUCT(&uct_module->endpoint_lock); - if (NULL != uct_module->rdma_tl) { - OBJ_RELEASE(uct_module->rdma_tl); + char *tmp; + asprintf(&tmp, "uct_%s", uct_module->md->md_name); + int rc = mca_base_var_group_find("opal", "btl", tmp); + free(tmp); + if (rc >= 0) { + mca_base_var_group_deregister(rc); } - ucs_async_context_destroy(uct_module->ucs_async); - - OBJ_DESTRUCT(&uct_module->endpoint_lock); - - free(uct_module->md_name); + OBJ_RELEASE(uct_module->md); free(uct_module); + for (int i = 0 ; i < MCA_BTL_UCT_MAX_MODULES ; ++i) { + if (mca_btl_uct_component.modules[i] == uct_module) { + mca_btl_uct_component.modules[i] = NULL; + break; + } + } + return OPAL_SUCCESS; } @@ -338,9 +368,11 @@ mca_btl_uct_module_t mca_btl_uct_module_template = { /* set the default flags for this btl. uct provides us with rdma and both * fetching and non-fetching atomics (though limited to add and cswap) */ .btl_flags = MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_ATOMIC_FOPS | MCA_BTL_FLAGS_ATOMIC_OPS - | MCA_BTL_FLAGS_RDMA_REMOTE_COMPLETION, - .btl_atomic_flags = MCA_BTL_ATOMIC_SUPPORTS_ADD | MCA_BTL_ATOMIC_SUPPORTS_CSWAP - | MCA_BTL_ATOMIC_SUPPORTS_SWAP | MCA_BTL_ATOMIC_SUPPORTS_32BIT, + | MCA_BTL_FLAGS_RDMA_REMOTE_COMPLETION | MCA_BTL_FLAGS_SEND, + .btl_atomic_flags = MCA_BTL_ATOMIC_SUPPORTS_ADD | MCA_BTL_ATOMIC_SUPPORTS_AND + | MCA_BTL_ATOMIC_SUPPORTS_OR | MCA_BTL_ATOMIC_SUPPORTS_XOR + | MCA_BTL_ATOMIC_SUPPORTS_CSWAP | MCA_BTL_ATOMIC_SUPPORTS_SWAP + | MCA_BTL_ATOMIC_SUPPORTS_32BIT, /* set the default limits on put and get */ .btl_put_limit = 1 << 23, @@ -353,22 +385,30 @@ mca_btl_uct_module_t mca_btl_uct_module_template = { .btl_rdma_pipeline_send_length = 8192, .btl_eager_limit = 8192, .btl_max_send_size = 65536, + /* for now we want this component to lose to btl/ugni and btl/vader */ + .btl_exclusivity = MCA_BTL_EXCLUSIVITY_HIGH - 1, }}; OBJ_CLASS_INSTANCE(mca_btl_uct_reg_t, opal_free_list_item_t, NULL, NULL); static void mca_btl_uct_md_construct(mca_btl_uct_md_t *md) { + md->uct_component = NULL; md->uct_md = NULL; + md->md_name = NULL; + OBJ_CONSTRUCT(&md->tls, opal_list_t); } static void mca_btl_uct_md_destruct(mca_btl_uct_md_t *md) { + OPAL_LIST_DESTRUCT(&md->tls); + + free(md->md_name); if (md->uct_md) { uct_md_close(md->uct_md); md->uct_md = NULL; } } -OBJ_CLASS_INSTANCE(mca_btl_uct_md_t, opal_object_t, mca_btl_uct_md_construct, +OBJ_CLASS_INSTANCE(mca_btl_uct_md_t, opal_list_item_t, mca_btl_uct_md_construct, mca_btl_uct_md_destruct); diff --git a/opal/mca/btl/uct/btl_uct_rdma.c b/opal/mca/btl/uct/btl_uct_rdma.c index d4210e4631c..e1e8f4b91d9 100644 --- a/opal/mca/btl/uct/btl_uct_rdma.c +++ b/opal/mca/btl/uct/btl_uct_rdma.c @@ -126,7 +126,7 @@ int mca_btl_uct_get(mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoin mca_btl_uct_context_lock(context); - if (size <= MCA_BTL_UCT_TL_ATTR(uct_btl->rdma_tl, context->context_id).cap.get.max_bcopy) { + if (size <= uct_btl->rdma_tl->uct_iface_attr.cap.get.max_bcopy) { ucs_status = uct_ep_get_bcopy(ep_handle, mca_btl_uct_get_unpack, 
local_address, size, remote_address, rkey.rkey, &comp->uct_comp); } else { @@ -223,7 +223,7 @@ int mca_btl_uct_put(mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoin /* determine what UCT prototol should be used */ if (size <= uct_btl->super.btl_put_local_registration_threshold) { use_short = size - <= MCA_BTL_UCT_TL_ATTR(uct_btl->rdma_tl, context->context_id).cap.put.max_short; + <= uct_btl->rdma_tl->uct_iface_attr.cap.put.max_short; use_bcopy = !use_short; } diff --git a/opal/mca/btl/uct/btl_uct_rdma.h b/opal/mca/btl/uct/btl_uct_rdma.h index 0438106b2c8..481be991b4d 100644 --- a/opal/mca/btl/uct/btl_uct_rdma.h +++ b/opal/mca/btl/uct/btl_uct_rdma.h @@ -53,7 +53,7 @@ static inline int mca_btl_uct_get_rkey(mca_btl_uct_module_t *module, } # if UCT_API >= UCT_VERSION(1, 7) - ucs_status = uct_rkey_unpack(module->uct_component, (void *) remote_handle, rkey); + ucs_status = uct_rkey_unpack(module->md->uct_component, (void *) remote_handle, rkey); # else ucs_status = uct_rkey_unpack((void *) remote_handle, rkey); # endif @@ -63,7 +63,7 @@ static inline int mca_btl_uct_get_rkey(mca_btl_uct_module_t *module, static inline void mca_btl_uct_rkey_release(mca_btl_uct_module_t *uct_btl, uct_rkey_bundle_t *rkey) { # if UCT_API >= UCT_VERSION(1, 7) - uct_rkey_release(uct_btl->uct_component, rkey); + uct_rkey_release(uct_btl->md->uct_component, rkey); # else (void) uct_btl; uct_rkey_release(rkey); diff --git a/opal/mca/btl/uct/btl_uct_tl.c b/opal/mca/btl/uct/btl_uct_tl.c index c1ef4c6d727..f55754bc9d8 100644 --- a/opal/mca/btl/uct/btl_uct_tl.c +++ b/opal/mca/btl/uct/btl_uct_tl.c @@ -18,12 +18,7 @@ #include "btl_uct_device_context.h" #include "opal/util/argv.h" #include "opal/util/bit_ops.h" - -#if HAVE_DECL_UCT_CB_FLAG_SYNC -# define MCA_BTL_UCT_CB_FLAG_SYNC UCT_CB_FLAG_SYNC -#else -# define MCA_BTL_UCT_CB_FLAG_SYNC 0 -#endif +#include "opal/util/minmax.h" /** * @brief Convert UCT capabilities to BTL flags @@ -70,13 +65,14 @@ static uint64_t mca_btl_uct_cap_to_btl_atomic_flag[][2] = { }, }; -static void mca_btl_uct_module_set_atomic_flags(mca_btl_uct_module_t *module, mca_btl_uct_tl_t *tl) +static void mca_btl_uct_module_set_atomic_flags(mca_btl_uct_module_t *module) { - uint64_t cap_flags = MCA_BTL_UCT_TL_ATTR(tl, 0).cap.flags; + mca_btl_uct_tl_t *tl = module->rdma_tl; + uint64_t cap_flags = tl->uct_iface_attr.cap.flags; /* NTH: only use the fetching atomics for now */ - uint64_t atomic_flags32 = MCA_BTL_UCT_TL_ATTR(tl, 0).cap.atomic32.fop_flags; - uint64_t atomic_flags64 = MCA_BTL_UCT_TL_ATTR(tl, 0).cap.atomic64.fop_flags; + uint64_t atomic_flags32 = tl->uct_iface_attr.cap.atomic32.fop_flags; + uint64_t atomic_flags64 = tl->uct_iface_attr.cap.atomic64.fop_flags; uint64_t all_flags = atomic_flags64 | atomic_flags32; @@ -120,9 +116,10 @@ static uint64_t mca_btl_uct_cap_to_btl_atomic_flag[][2] = { * * @returns equivalent BTL atomic flags */ -static void mca_btl_uct_module_set_atomic_flags(mca_btl_uct_module_t *module, mca_btl_uct_tl_t *tl) +static void mca_btl_uct_module_set_atomic_flags(mca_btl_uct_module_t *module) { - uint64_t cap_flags = MCA_BTL_UCT_TL_ATTR(tl, 0).cap.flags; + mca_btl_uct_tl_t *tl = module->rdma_tl; + uint64_t cap_flags = tl->uct_iface_attr.cap.flags; module->super.btl_atomic_flags = 0; @@ -144,6 +141,7 @@ static void mca_btl_uct_tl_constructor(mca_btl_uct_tl_t *tl) { memset((void *) ((uintptr_t) tl + sizeof(tl->super)), 0, sizeof(*tl) - sizeof(tl->super)); OBJ_CONSTRUCT(&tl->tl_lock, opal_mutex_t); + OBJ_CONSTRUCT(&tl->pending_connection_reqs, opal_fifo_t); } static void 
mca_btl_uct_tl_destructor(mca_btl_uct_tl_t *tl) @@ -156,11 +154,10 @@ static void mca_btl_uct_tl_destructor(mca_btl_uct_tl_t *tl) } } - if (tl->uct_md) { - OBJ_RELEASE(tl->uct_md); + if (tl->ucs_async) { + ucs_async_context_destroy(tl->ucs_async); } - free(tl->uct_dev_contexts); free(tl->uct_tl_name); free(tl->uct_dev_name); @@ -169,6 +166,7 @@ static void mca_btl_uct_tl_destructor(mca_btl_uct_tl_t *tl) } OBJ_DESTRUCT(&tl->tl_lock); + OBJ_DESTRUCT(&tl->pending_connection_reqs); } OBJ_CLASS_INSTANCE(mca_btl_uct_tl_t, opal_list_item_t, mca_btl_uct_tl_constructor, @@ -176,14 +174,14 @@ OBJ_CLASS_INSTANCE(mca_btl_uct_tl_t, opal_list_item_t, mca_btl_uct_tl_constructo static ucs_status_t mca_btl_uct_conn_req_cb(void *arg, void *data, size_t length, unsigned flags) { - mca_btl_uct_module_t *module = (mca_btl_uct_module_t *) arg; + mca_btl_uct_tl_t *tl = (mca_btl_uct_tl_t *) arg; mca_btl_uct_pending_connection_request_t *request = calloc(1, length + sizeof(request->super)); /* it is not safe to process the connection request from the callback so just save it for * later processing */ OBJ_CONSTRUCT(request, mca_btl_uct_pending_connection_request_t); memcpy(&request->request_data, (void *) ((intptr_t) data + 8), length); - opal_fifo_push_atomic(&module->pending_connection_reqs, &request->super); + opal_fifo_push_atomic(&tl->pending_connection_reqs, &request->super); return UCS_OK; } @@ -238,17 +236,21 @@ int mca_btl_uct_process_connection_request(mca_btl_uct_module_t *module, return OPAL_SUCCESS; } -static int mca_btl_uct_setup_connection_tl(mca_btl_uct_module_t *module) +static int mca_btl_uct_setup_connection_tl(mca_btl_uct_tl_t *tl) { ucs_status_t ucs_status; - if (NULL == module->conn_tl) { + if (NULL == tl) { return OPAL_ERR_NOT_SUPPORTED; } - ucs_status = uct_iface_set_am_handler(module->conn_tl->uct_dev_contexts[0]->uct_iface, - MCA_BTL_UCT_CONNECT_RDMA, mca_btl_uct_conn_req_cb, module, - UCT_CB_FLAG_ASYNC); + mca_btl_uct_device_context_t *context = + mca_btl_uct_module_get_tl_context_specific(/*module=*/NULL, tl, + /*context_id=*/0); + + ucs_status = uct_iface_set_am_handler(context->uct_iface, + MCA_BTL_UCT_CONNECT_RDMA, mca_btl_uct_conn_req_cb, + tl, UCT_CB_FLAG_ASYNC); if (UCS_OK != ucs_status) { BTL_ERROR(("could not set active message handler for uct tl")); } @@ -256,23 +258,7 @@ static int mca_btl_uct_setup_connection_tl(mca_btl_uct_module_t *module) return UCS_OK == ucs_status ? 
OPAL_SUCCESS : OPAL_ERROR; } -static void mca_btl_uct_context_enable_progress(mca_btl_uct_device_context_t *context) -{ - if (!context->progress_enabled) { -#if HAVE_DECL_UCT_PROGRESS_THREAD_SAFE - uct_iface_progress_enable(context->uct_iface, - UCT_PROGRESS_THREAD_SAFE | UCT_PROGRESS_SEND | UCT_PROGRESS_RECV); -#else - uct_iface_progress_enable(context->uct_iface, UCT_PROGRESS_SEND | UCT_PROGRESS_RECV); -#endif - context->progress_enabled = true; - } -} - -mca_btl_uct_device_context_t *mca_btl_uct_context_create(mca_btl_uct_module_t *module, - mca_btl_uct_tl_t *tl, int context_id, - bool enable_progress) -{ +static int mca_btl_uct_populate_tl_attr(mca_btl_uct_tl_t *tl) { #if UCT_API >= UCT_VERSION(1, 6) uct_iface_params_t iface_params = {.field_mask = UCT_IFACE_PARAM_FIELD_OPEN_MODE | UCT_IFACE_PARAM_FIELD_DEVICE, @@ -288,96 +274,38 @@ mca_btl_uct_device_context_t *mca_btl_uct_context_create(mca_btl_uct_module_t *m .mode = {.device = {.tl_name = tl->uct_tl_name, .dev_name = tl->uct_dev_name}}}; #endif - mca_btl_uct_device_context_t *context; ucs_status_t ucs_status; - int rc; - context = calloc(1, sizeof(*context)); - if (OPAL_UNLIKELY(NULL == context)) { - return NULL; - } - - context->context_id = context_id; - context->uct_btl = module; - OBJ_CONSTRUCT(&context->completion_fifo, opal_fifo_t); - OBJ_CONSTRUCT(&context->mutex, opal_recursive_mutex_t); - OBJ_CONSTRUCT(&context->rdma_completions, opal_free_list_t); - - rc = opal_free_list_init(&context->rdma_completions, sizeof(mca_btl_uct_uct_completion_t), - opal_cache_line_size, OBJ_CLASS(mca_btl_uct_uct_completion_t), 0, - opal_cache_line_size, 0, 4096, 128, NULL, 0, NULL, NULL, NULL); - if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { - mca_btl_uct_context_destroy(context); - return NULL; - } - - /* apparently (in contradiction to the spec) UCT is *not* thread safe. because we have to - * use our own locks just go ahead and use UCS_THREAD_MODE_SINGLE. if they ever fix their - * api then change this back to UCS_THREAD_MODE_MULTI and remove the locks around the - * various UCT calls. */ - ucs_status = uct_worker_create(module->ucs_async, UCS_THREAD_MODE_SINGLE, &context->uct_worker); + /* do the bare minimum to get tl attributes */ + uct_worker_h uct_worker; + ucs_status = uct_worker_create(tl->ucs_async, UCS_THREAD_MODE_SINGLE, &uct_worker); if (OPAL_UNLIKELY(UCS_OK != ucs_status)) { BTL_VERBOSE(("could not create a UCT worker")); - mca_btl_uct_context_destroy(context); - return NULL; + return OPAL_ERROR; } - ucs_status = uct_iface_open(tl->uct_md->uct_md, context->uct_worker, &iface_params, - tl->uct_tl_config, &context->uct_iface); + uct_iface_h uct_iface; + ucs_status = uct_iface_open(tl->uct_md->uct_md, uct_worker, &iface_params, + tl->uct_tl_config, &uct_iface); if (OPAL_UNLIKELY(UCS_OK != ucs_status)) { BTL_VERBOSE(("could not open UCT interface. 
error code: %d", ucs_status)); - mca_btl_uct_context_destroy(context); - return NULL; + uct_worker_destroy(uct_worker); + return OPAL_ERROR; } - /* only need to query one of the interfaces to get the attributes */ - ucs_status = uct_iface_query(context->uct_iface, &context->uct_iface_attr); + int rc = OPAL_SUCCESS; + ucs_status = uct_iface_query(uct_iface, &tl->uct_iface_attr); if (UCS_OK != ucs_status) { BTL_VERBOSE(("Error querying UCT interface")); - mca_btl_uct_context_destroy(context); - return NULL; + rc = OPAL_ERROR; } - if (context_id > 0 && tl == module->am_tl) { - BTL_VERBOSE(("installing AM handler for tl %p context id %d", (void *) tl, context_id)); - uct_iface_set_am_handler(context->uct_iface, MCA_BTL_UCT_FRAG, mca_btl_uct_am_handler, - context, MCA_BTL_UCT_CB_FLAG_SYNC); - } - - if (enable_progress) { - BTL_VERBOSE(("enabling progress for tl %p context id %d", (void *) tl, context_id)); - mca_btl_uct_context_enable_progress(context); - } - - return context; -} - -void mca_btl_uct_context_destroy(mca_btl_uct_device_context_t *context) -{ - if (context->uct_iface) { - uct_iface_close(context->uct_iface); - context->uct_iface = NULL; - } - - if (context->uct_worker) { - uct_worker_destroy(context->uct_worker); - context->uct_worker = NULL; - } - - OBJ_DESTRUCT(&context->completion_fifo); - OBJ_DESTRUCT(&context->rdma_completions); - free(context); -} - -static int tl_compare(opal_list_item_t **a, opal_list_item_t **b) -{ - mca_btl_uct_tl_t *tl_a = (mca_btl_uct_tl_t *) *a; - mca_btl_uct_tl_t *tl_b = (mca_btl_uct_tl_t *) *b; - - return tl_a->priority - tl_b->priority; + uct_iface_close(uct_iface); + uct_worker_destroy(uct_worker); + return rc; } -static mca_btl_uct_tl_t *mca_btl_uct_create_tl(mca_btl_uct_module_t *module, mca_btl_uct_md_t *md, +static mca_btl_uct_tl_t *mca_btl_uct_create_tl(mca_btl_uct_md_t *md, uct_tl_resource_desc_t *tl_desc, int priority) { mca_btl_uct_tl_t *tl = OBJ_NEW(mca_btl_uct_tl_t); @@ -388,30 +316,29 @@ static mca_btl_uct_tl_t *mca_btl_uct_create_tl(mca_btl_uct_module_t *module, mca /* initialize btl tl structure */ tl->uct_md = md; - OBJ_RETAIN(md); tl->uct_tl_name = strdup(tl_desc->tl_name); tl->uct_dev_name = strdup(tl_desc->dev_name); + tl->dev_type = tl_desc->dev_type; tl->priority = priority; - tl->uct_dev_contexts = calloc(MCA_BTL_UCT_MAX_WORKERS, sizeof(tl->uct_dev_contexts[0])); - if (NULL == tl->uct_dev_contexts) { + (void) uct_md_iface_config_read(md->uct_md, tl_desc->tl_name, NULL, NULL, &tl->uct_tl_config); + + ucs_status_t ucs_status = ucs_async_context_create(UCS_ASYNC_MODE_THREAD, &tl->ucs_async); + if (UCS_OK != ucs_status) { + BTL_VERBOSE(("Could not create a UCT async context")); OBJ_RELEASE(tl); return NULL; } - (void) uct_md_iface_config_read(md->uct_md, tl_desc->tl_name, NULL, NULL, &tl->uct_tl_config); - - /* always create a 0 context (needed to query) */ - tl->uct_dev_contexts[0] = mca_btl_uct_context_create(module, tl, 0, false); - if (NULL == tl->uct_dev_contexts[0]) { - BTL_VERBOSE(("could not create a uct device context")); + int rc = mca_btl_uct_populate_tl_attr(tl); + if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { OBJ_RELEASE(tl); return NULL; } - BTL_VERBOSE(("Interface CAPS for tl %s::%s: 0x%lx", module->md_name, tl_desc->tl_name, - (unsigned long) MCA_BTL_UCT_TL_ATTR(tl, 0).cap.flags)); + BTL_VERBOSE(("Interface CAPS for tl %s::%s::%s 0x%lx", md->md_name, tl_desc->tl_name, + tl_desc->dev_name, (unsigned long) tl->uct_iface_attr.cap.flags)); return tl; } @@ -420,32 +347,32 @@ static void 
mca_btl_uct_set_tl_rdma(mca_btl_uct_module_t *module, mca_btl_uct_tl { BTL_VERBOSE(("tl %s is suitable for RDMA", tl->uct_tl_name)); - mca_btl_uct_module_set_atomic_flags(module, tl); + module->rdma_tl = tl; + + mca_btl_uct_module_set_atomic_flags(module); - module->super.btl_get_limit = MCA_BTL_UCT_TL_ATTR(tl, 0).cap.get.max_zcopy; - if (MCA_BTL_UCT_TL_ATTR(tl, 0).cap.get.max_bcopy) { + module->super.btl_get_limit = opal_min(tl->uct_iface_attr.cap.get.max_zcopy, + module->super.btl_get_limit); + if (tl->uct_iface_attr.cap.get.max_bcopy) { module->super.btl_get_alignment = 0; - module->super.btl_get_local_registration_threshold = MCA_BTL_UCT_TL_ATTR(tl, 0) + module->super.btl_get_local_registration_threshold = tl->uct_iface_attr .cap.get.max_bcopy; } else { /* this is overkill in terms of alignment but we have no way to enforce a minimum get size */ module->super.btl_get_alignment = opal_next_poweroftwo_inclusive( - MCA_BTL_UCT_TL_ATTR(tl, 0).cap.get.min_zcopy); + tl->uct_iface_attr.cap.get.min_zcopy); } - module->super.btl_put_limit = MCA_BTL_UCT_TL_ATTR(tl, 0).cap.put.max_zcopy; + module->super.btl_put_limit = opal_min(tl->uct_iface_attr.cap.put.max_zcopy, + module->super.btl_put_limit); module->super.btl_put_alignment = 0; /* no registration needed when using short/bcopy put */ - module->super.btl_put_local_registration_threshold = MCA_BTL_UCT_TL_ATTR(tl, 0) + module->super.btl_put_local_registration_threshold = tl->uct_iface_attr .cap.put.max_bcopy; - module->rdma_tl = tl; - OBJ_RETAIN(tl); - tl->tl_index = (module->am_tl && tl != module->am_tl) ? 1 : 0; - module->comm_tls[tl->tl_index] = tl; if (tl->max_device_contexts <= 1) { tl->max_device_contexts = mca_btl_uct_component.num_contexts_per_module; } @@ -454,46 +381,37 @@ static void mca_btl_uct_set_tl_rdma(mca_btl_uct_module_t *module, mca_btl_uct_tl static void mca_btl_uct_set_tl_am(mca_btl_uct_module_t *module, mca_btl_uct_tl_t *tl) { BTL_VERBOSE(("tl %s is suitable for active-messaging", tl->uct_tl_name)); - - if (module->rdma_tl == tl) { - module->shared_endpoints = true; - } module->am_tl = tl; - OBJ_RETAIN(tl); - - uct_iface_set_am_handler(tl->uct_dev_contexts[0]->uct_iface, MCA_BTL_UCT_FRAG, - mca_btl_uct_am_handler, tl->uct_dev_contexts[0], UCT_CB_FLAG_ASYNC); tl->tl_index = (module->rdma_tl && tl != module->rdma_tl) ? 
1 : 0; - module->comm_tls[tl->tl_index] = tl; if (tl->max_device_contexts <= 1) { tl->max_device_contexts = mca_btl_uct_component.num_contexts_per_module; } - module->super.btl_eager_limit = MCA_BTL_UCT_TL_ATTR(tl, 0).cap.am.max_bcopy - - sizeof(mca_btl_uct_am_header_t); - if (MCA_BTL_UCT_TL_ATTR(tl, 0).cap.flags & UCT_IFACE_FLAG_AM_ZCOPY) { - module->super.btl_max_send_size = MCA_BTL_UCT_TL_ATTR(tl, 0).cap.am.max_zcopy - - sizeof(mca_btl_uct_am_header_t); - } else { - module->super.btl_max_send_size = module->super.btl_eager_limit; + size_t max_eager_limit = tl->uct_iface_attr.cap.am.max_bcopy + - sizeof(mca_btl_uct_am_header_t); + size_t max_send_size = max_eager_limit; + + if (tl->uct_iface_attr.cap.flags & UCT_IFACE_FLAG_AM_ZCOPY) { + max_send_size = opal_max(max_send_size, tl->uct_iface_attr.cap.am.max_zcopy + - sizeof(mca_btl_uct_am_header_t)); } + + module->super.btl_eager_limit = opal_min(module->super.btl_eager_limit, max_eager_limit); + module->super.btl_max_send_size = opal_min(module->super.btl_max_send_size, max_send_size); } -static int mca_btl_uct_set_tl_conn(mca_btl_uct_module_t *module, mca_btl_uct_tl_t *tl) +int mca_btl_uct_enable_tl_conn(mca_btl_uct_tl_t *tl) { int rc; BTL_VERBOSE(("tl %s is suitable for making connections", tl->uct_tl_name)); - module->conn_tl = tl; - rc = mca_btl_uct_setup_connection_tl(module); + rc = mca_btl_uct_setup_connection_tl(tl); if (OPAL_SUCCESS != rc) { return rc; } - OBJ_RETAIN(tl); - if (!tl->max_device_contexts) { /* if a tl is only being used to create connections do not bother with multiple * contexts */ @@ -503,11 +421,9 @@ static int mca_btl_uct_set_tl_conn(mca_btl_uct_module_t *module, mca_btl_uct_tl_ return OPAL_SUCCESS; } -static int mca_btl_uct_evaluate_tl(mca_btl_uct_module_t *module, mca_btl_uct_tl_t *tl) +int mca_btl_uct_evaluate_tl(mca_btl_uct_module_t *module, mca_btl_uct_tl_t *tl) { - int rc; - - BTL_VERBOSE(("evaluating tl %s", tl->uct_tl_name)); + BTL_VERBOSE(("evaluating tl %s::%s", tl->uct_md->md_name, tl->uct_tl_name)); if (NULL == module->rdma_tl && mca_btl_uct_tl_supports_rdma(tl)) { mca_btl_uct_set_tl_rdma(module, tl); } @@ -516,165 +432,48 @@ static int mca_btl_uct_evaluate_tl(mca_btl_uct_module_t *module, mca_btl_uct_tl_ mca_btl_uct_set_tl_am(module, tl); } - if (NULL == module->conn_tl && mca_btl_uct_tl_supports_conn(tl)) { - rc = mca_btl_uct_set_tl_conn(module, tl); - if (OPAL_SUCCESS != rc) { - return rc; - } - } - if (tl == module->rdma_tl || tl == module->am_tl) { - BTL_VERBOSE(("tl has flags 0x%" PRIx64, MCA_BTL_UCT_TL_ATTR(tl, 0).cap.flags)); - module->super.btl_flags |= mca_btl_uct_module_flags(MCA_BTL_UCT_TL_ATTR(tl, 0).cap.flags); + BTL_VERBOSE(("tl has flags 0x%" PRIx64, tl->uct_iface_attr.cap.flags)); + module->super.btl_flags |= mca_btl_uct_module_flags(tl->uct_iface_attr.cap.flags); + module->super.btl_flags |= MCA_BTL_FLAGS_RDMA_REMOTE_COMPLETION; /* the bandwidth and latency numbers relate to both rdma and active messages. need to * come up with a better estimate. 
*/ /* UCT bandwidth is in bytes/sec, BTL is in MB/sec */ #if UCT_API >= UCT_VERSION(1, 7) - module->super.btl_bandwidth = (uint32_t)((MCA_BTL_UCT_TL_ATTR(tl, 0).bandwidth.dedicated - + MCA_BTL_UCT_TL_ATTR(tl, 0).bandwidth.shared + module->super.btl_bandwidth = (uint32_t)((tl->uct_iface_attr.bandwidth.dedicated + + tl->uct_iface_attr.bandwidth.shared / (opal_process_info.num_local_peers + 1)) / 1048576.0); #else - module->super.btl_bandwidth = (uint32_t)(MCA_BTL_UCT_TL_ATTR(tl, 0).bandwidth / 1048576.0); + module->super.btl_bandwidth = (uint32_t)(tl->uct_iface_attr.bandwidth / 1048576.0); #endif /* TODO -- figure out how to translate UCT latency to us */ module->super.btl_latency = 1; } - if (tl == module->rdma_tl || tl == module->am_tl || tl == module->conn_tl) { - /* make sure progress is enabled on the default context now that we know this TL will be - * used */ - mca_btl_uct_context_enable_progress(tl->uct_dev_contexts[0]); - } - return OPAL_SUCCESS; } -int mca_btl_uct_query_tls(mca_btl_uct_module_t *module, mca_btl_uct_md_t *md, - uct_tl_resource_desc_t *tl_descs, unsigned tl_count) +int mca_btl_uct_populate_tls(mca_btl_uct_md_t *md, uct_tl_resource_desc_t *tl_descs, unsigned tl_count) { - bool include = true, any = false; - mca_btl_uct_tl_t *tl; - opal_list_t tl_list; - char **tl_filter; - int any_priority = 0; - - OBJ_CONSTRUCT(&tl_list, opal_list_t); - - tl_filter = opal_argv_split(mca_btl_uct_component.allowed_transports, ','); - - if ('^' == tl_filter[0][0]) { - /* user has negated the include list */ - char *tmp = strdup(tl_filter[0] + 1); - - free(tl_filter[0]); - tl_filter[0] = tmp; - include = false; - } - - /* check for the any keyword */ - for (unsigned j = 0; tl_filter[j]; ++j) { - if (0 == strcmp(tl_filter[j], "any")) { - any_priority = j; - any = true; - break; - } - } - - if (any && !include) { - opal_argv_free(tl_filter); - return OPAL_ERR_NOT_AVAILABLE; - } + BTL_VERBOSE(("processing %u tls in memory domain %s", tl_count, md->md_name)); for (unsigned i = 0; i < tl_count; ++i) { - bool try_tl = any; - int priority = any_priority; - - for (unsigned j = 0; tl_filter[j]; ++j) { - if (0 == strcmp(tl_filter[j], tl_descs[i].tl_name)) { - try_tl = include; - priority = j; - break; - } - } - - BTL_VERBOSE(("tl filter: tl_name = %s, use = %d, priority = %d", tl_descs[i].tl_name, - try_tl, priority)); - - if (!try_tl) { - continue; - } - - if (0 == strcmp(tl_descs[i].tl_name, "ud")) { - /* ud looks like any normal transport but we do not want to use it for anything other - * than connection management so ensure it gets evaluated last */ - priority = INT_MAX; - } - - tl = mca_btl_uct_create_tl(module, md, tl_descs + i, priority); + BTL_VERBOSE(("processing tl %s::%s::%s", md->md_name, tl_descs[i].tl_name, tl_descs[i].dev_name)); + /* the priority will be set during module creation */ + mca_btl_uct_tl_t *tl = mca_btl_uct_create_tl(md, tl_descs + i, /*priority=*/0); if (tl) { - opal_list_append(&tl_list, &tl->super); + opal_list_append(&md->tls, &tl->super); } } - opal_argv_free(tl_filter); - - if (0 == opal_list_get_size(&tl_list)) { + if (0 == opal_list_get_size(&md->tls)) { BTL_VERBOSE(("no suitable tls match filter: %s", mca_btl_uct_component.allowed_transports)); - OBJ_DESTRUCT(&tl_list); return OPAL_ERR_NOT_AVAILABLE; } - opal_list_sort(&tl_list, tl_compare); - - OPAL_LIST_FOREACH (tl, &tl_list, mca_btl_uct_tl_t) { - mca_btl_uct_evaluate_tl(module, tl); - if (NULL != module->am_tl && NULL != module->rdma_tl - && (NULL != module->conn_tl - || 
!(mca_btl_uct_tl_requires_connection_tl(module->am_tl) - || mca_btl_uct_tl_requires_connection_tl(module->rdma_tl)))) { - /* all done */ - break; - } - } - - if (NULL == module->rdma_tl) { - /* no rdma tls */ - BTL_VERBOSE(("no rdma tl matched supplied filter. disabling RDMA support")); - - module->super.btl_flags &= ~MCA_BTL_FLAGS_RDMA; - module->super.btl_put = NULL; - module->super.btl_get = NULL; - module->super.btl_atomic_fop = NULL; - module->super.btl_atomic_op = NULL; - } - - if (NULL == module->am_tl) { - /* no active message tls == no send/recv */ - BTL_VERBOSE(("no active message tl matched supplied filter. disabling send/recv support")); - - module->super.btl_send = NULL; - module->super.btl_sendi = NULL; - module->super.btl_alloc = NULL; - module->super.btl_free = NULL; - } - - OPAL_LIST_DESTRUCT(&tl_list); - - if (!(NULL != module->am_tl && mca_btl_uct_tl_requires_connection_tl(module->am_tl)) - && !(NULL != module->rdma_tl && mca_btl_uct_tl_requires_connection_tl(module->rdma_tl)) - && module->conn_tl) { - /* no connection tl needed for selected transports */ - OBJ_RELEASE(module->conn_tl); - module->conn_tl = NULL; - } else if (NULL == module->conn_tl) { - BTL_VERBOSE(("a connection tl is required but no tls match the filter %s", - mca_btl_uct_component.allowed_transports)); - return OPAL_ERROR; - } - return OPAL_SUCCESS; } diff --git a/opal/mca/btl/uct/btl_uct_types.h b/opal/mca/btl/uct/btl_uct_types.h index b2bac61be61..15bb527aad5 100644 --- a/opal/mca/btl/uct/btl_uct_types.h +++ b/opal/mca/btl/uct/btl_uct_types.h @@ -10,17 +10,23 @@ * $HEADER$ */ +#include + #if !defined(BTL_UCT_TYPES_H) # define BTL_UCT_TYPES_H # include "opal/mca/btl/btl.h" +#include "opal/class/opal_fifo.h" +#include "opal/class/opal_list.h" +#include "opal/class/opal_object.h" #include "opal/mca/timer/base/base.h" /* forward declarations */ struct mca_btl_uct_module_t; struct mca_btl_base_endpoint_t; struct mca_btl_uct_base_frag_t; +struct mca_btl_uct_tl_t; /* TL endpoint flags */ /** connection data was received */ @@ -64,10 +70,27 @@ typedef struct mca_btl_uct_modex_t mca_btl_uct_modex_t; */ struct mca_btl_uct_md_t { /** make this an opal object */ - opal_object_t super; + opal_list_item_t super; + + /** if true none of the tls in this domain will be used + * for communication */ + bool connection_only_domain; + + /** name of the memory domain backing this module */ + char *md_name; + + /** list of mca_btl_uct_tl_t's for this memory domain */ + opal_list_t tls; /** UCT memory domain handle */ uct_md_h uct_md; + + /** memory domain attributes */ + uct_md_attr_t md_attr; + +#if UCT_API >= UCT_VERSION(1, 7) + uct_component_h uct_component; +#endif }; typedef struct mca_btl_uct_md_t mca_btl_uct_md_t; @@ -90,6 +113,9 @@ struct mca_btl_uct_conn_req_t { /** transport index that should be connected */ int tl_index; + /** module that is being connected (local index to the receiver) */ + int module_index; + /** endpoint address data */ uint8_t ep_addr[]; }; @@ -119,6 +145,8 @@ struct mca_btl_uct_connection_ep_t { /** opal base object */ opal_object_t super; + struct mca_btl_uct_tl_t *tl; + /** UCT endpoint used for connection */ uct_ep_h uct_ep; }; @@ -151,9 +179,6 @@ struct mca_btl_uct_device_context_t { /** UCT interface handle */ uct_iface_h uct_iface; - /** interface attributes */ - uct_iface_attr_t uct_iface_attr; - /** RDMA completions */ opal_free_list_t rdma_completions; @@ -164,6 +189,9 @@ struct mca_btl_uct_device_context_t { /** progress is enabled on this context */ bool progress_enabled; + /** 
communication AM handler is installed */ + bool am_handler_installed; + /** context is in AM callback */ volatile bool in_am_callback; }; @@ -267,7 +295,10 @@ struct mca_btl_uct_base_frag_t { mca_btl_uct_am_header_t header; /** pre-filled UCT io vector */ - uct_iov_t uct_iov; + uct_iov_t uct_iov[3]; + + /** how many iov entries are filled */ + int uct_iov_count; /** completion structure */ mca_btl_uct_uct_completion_t comp; @@ -285,7 +316,7 @@ struct mca_btl_base_endpoint_t { opal_proc_t *ep_proc; /** mutex to protect this structure */ - opal_recursive_mutex_t ep_lock; + opal_mutex_t ep_lock; /** cached connection endpoint */ mca_btl_uct_connection_ep_t *conn_ep; @@ -308,7 +339,7 @@ struct mca_btl_uct_tl_t { /** relative priority 0 == highest */ int priority; - /** memory domain associated with this tl */ + /** memory domain associated with this tl (no reference) */ mca_btl_uct_md_t *uct_md; /** lock protecting tl structures */ @@ -323,22 +354,32 @@ struct mca_btl_uct_tl_t { /** device name for this tl (used for creating device contexts) */ char *uct_dev_name; + /** UCT device type from the tl description */ + uct_device_type_t dev_type; + /** maximum number of device contexts that can be created */ int max_device_contexts; /** array of device contexts */ - mca_btl_uct_device_context_t **uct_dev_contexts; + mca_btl_uct_device_context_t *uct_dev_contexts[MCA_BTL_UCT_MAX_WORKERS]; /** tl index. this is used to differentiate (if there is any difference) * between rdma and am endpoints */ int tl_index; + + /** interface attributes */ + uct_iface_attr_t uct_iface_attr; + + /** async context */ + ucs_async_context_t *ucs_async; + + /** pending connection requests */ + opal_fifo_t pending_connection_reqs; }; typedef struct mca_btl_uct_tl_t mca_btl_uct_tl_t; OBJ_CLASS_DECLARATION(mca_btl_uct_tl_t); -# define MCA_BTL_UCT_TL_ATTR(tl, context_id) (tl)->uct_dev_contexts[(context_id)]->uct_iface_attr - struct mca_btl_uct_pending_connection_request_t { opal_list_item_t super; uint8_t request_data[]; @@ -347,4 +388,36 @@ struct mca_btl_uct_pending_connection_request_t { typedef struct mca_btl_uct_pending_connection_request_t mca_btl_uct_pending_connection_request_t; OBJ_CLASS_DECLARATION(mca_btl_uct_pending_connection_request_t); +/** + * @brief parsed include/exclude list + * + */ +struct mca_btl_uct_include_list_t { + opal_object_t super; + + /** argv-style (NULL terminated) array of strings */ + char **list; + /** is an inclusive list (vs exclusive) */ + bool include; +}; +typedef struct mca_btl_uct_include_list_t mca_btl_uct_include_list_t; +OBJ_CLASS_DECLARATION(mca_btl_uct_include_list_t); + +struct mca_btl_uct_tl_modex_t { + /** total size of this modex */ + uint16_t size; + char tl_name[UCT_TL_NAME_MAX]; + uint8_t data[]; +} __opal_attribute_packed__; +typedef struct mca_btl_uct_tl_modex_t mca_btl_uct_tl_modex_t; + +struct mca_btl_uct_md_modex_t { + /** total size of this modex */ + uint16_t size; + uint16_t module_index; + char md_name[UCT_MD_NAME_MAX]; + uint8_t data[]; +} __opal_attribute_packed__; +typedef struct mca_btl_uct_md_modex_t mca_btl_uct_md_modex_t; + #endif /* !defined(BTL_UCT_TYPES_H) */ From 0e71b93e67502a6614e9a4aadbfdce50245096a3 Mon Sep 17 00:00:00 2001 From: Howard Pritchard Date: Mon, 24 Nov 2025 15:33:57 -0700 Subject: [PATCH 23/51] pmix: advance sha to 632bc703 related to #13493 Signed-off-by: Howard Pritchard --- 3rd-party/openpmix | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rd-party/openpmix b/3rd-party/openpmix index e87b2ee832d..632bc703f93 
160000
--- a/3rd-party/openpmix
+++ b/3rd-party/openpmix
@@ -1 +1 @@
-Subproject commit e87b2ee832d249fadd61938a578aaee407b976b9
+Subproject commit 632bc703f9352655de70263313ffb77879ee4e37

From 28e27db7cddd6fcc1c70d679d478bd5c7ed118c4 Mon Sep 17 00:00:00 2001
From: Kento Hasegawa
Date: Tue, 25 Nov 2025 21:21:42 +0900
Subject: [PATCH 24/51] op/example: fix warnings and an error at compilation

Signed-off-by: Kento Hasegawa
---
 ompi/mca/op/example/README.md                | 14 +++++++-------
 ompi/mca/op/example/op_example_component.c   |  1 -
 ompi/mca/op/example/op_example_module_bxor.c | 12 ++++++------
 ompi/mca/op/example/op_example_module_max.c  |  8 ++++----
 4 files changed, 17 insertions(+), 18 deletions(-)

diff --git a/ompi/mca/op/example/README.md b/ompi/mca/op/example/README.md
index 7999cff389b..844e685d41b 100644
--- a/ompi/mca/op/example/README.md
+++ b/ompi/mca/op/example/README.md
@@ -56,7 +56,7 @@ do:
 - `op_example_component.c`: The main "component" source file.
 - `op_example_module.c`: The main "module" source file.
 - `op_example.h`: information that is shared between the `.c` files.
-- `.ompi_ignore`: the presence of this file causes OMPI's `autogen.pl`
+- `.opal_ignore`: the presence of this file causes OMPI's `autogen.pl`
   to skip this component in the configure/build/install process (see
   below).

@@ -69,18 +69,18 @@ shell$ cp -r example foo
 shell$ cd foo
 ```

-Remove the `.ompi_ignore` file (which makes the component "visible" to
-all developers) *OR* add an `.ompi_unignore` file with one username per
+Remove the `.opal_ignore` file (which makes the component "visible" to
+all developers) *OR* add an `.opal_unignore` file with one username per
 line (as reported by `whoami`). OMPI's `autogen.pl` will skip any
-component with a `.ompi_ignore` file *unless* there is also an
-.ompi_unignore file containing your user ID in it. This is a handy
+component with a `.opal_ignore` file *unless* there is also an
+`.opal_unignore` file containing your user ID in it. 
This is a handy mechanism to have a component in the tree but have it not built / used by most other developers: ``` -shell$ rm .ompi_ignore +shell$ rm .opal_ignore *OR* -shell$ whoami > .ompi_unignore +shell$ whoami > .opal_unignore ``` Now rename any file that contains `example` in the filename to have diff --git a/ompi/mca/op/example/op_example_component.c b/ompi/mca/op/example/op_example_component.c index 3faac13fdad..3d3fbade3be 100644 --- a/ompi/mca/op/example/op_example_component.c +++ b/ompi/mca/op/example/op_example_component.c @@ -123,7 +123,6 @@ static char *example_component_version; */ static int example_component_register(void) { - int val; char *str; opal_output(ompi_op_base_framework.framework_output, "example component register"); diff --git a/ompi/mca/op/example/op_example_module_bxor.c b/ompi/mca/op/example/op_example_module_bxor.c index 23a90ede488..fa201b82d88 100644 --- a/ompi/mca/op/example/op_example_module_bxor.c +++ b/ompi/mca/op/example/op_example_module_bxor.c @@ -109,7 +109,7 @@ static OBJ_CLASS_INSTANCE(module_bxor_t, /** * Bxor function for C int */ -static void bxor_int(void *in, void *out, int *count, +static void bxor_int(const void *in, void *out, int *count, ompi_datatype_t **type, ompi_op_base_module_t *module) { module_bxor_t *m = (module_bxor_t*) module; @@ -143,7 +143,7 @@ static void bxor_int(void *in, void *out, int *count, /** * Bxor function for C long */ -static void bxor_long(void *in, void *out, int *count, +static void bxor_long(const void *in, void *out, int *count, ompi_datatype_t **type, ompi_op_base_module_t *module) { module_bxor_t *m = (module_bxor_t*) module; @@ -157,7 +157,7 @@ static void bxor_long(void *in, void *out, int *count, /** * Bxor function for Fortran INTEGER */ -static void bxor_integer(void *in, void *out, int *count, +static void bxor_integer(const void *in, void *out, int *count, ompi_datatype_t **type, ompi_op_base_module_t *module) { module_bxor_t *m = (module_bxor_t*) module; @@ -191,10 +191,10 @@ ompi_op_base_module_t *ompi_op_example_setup_bxor(ompi_op_t *op) (i.e., they're already assigned on the op). */ /* C int */ - module->super.opm_fns[OMPI_OP_BASE_TYPE_INT] = bxor_int; - module->fallback_int = op->o_func.intrinsic.fns[OMPI_OP_BASE_TYPE_INT]; + module->super.opm_fns[OMPI_OP_BASE_TYPE_INT32_T] = bxor_int; + module->fallback_int = op->o_func.intrinsic.fns[OMPI_OP_BASE_TYPE_INT32_T]; module->fallback_int_module = - op->o_func.intrinsic.modules[OMPI_OP_BASE_TYPE_INT]; + op->o_func.intrinsic.modules[OMPI_OP_BASE_TYPE_INT32_T]; /* If you cache a fallback function, you *must* RETAIN (i.e., increase the refcount) its module so that the module knows that it is being used and won't be freed/destructed. 
*/ diff --git a/ompi/mca/op/example/op_example_module_max.c b/ompi/mca/op/example/op_example_module_max.c index 4c43ecf22a4..df6175949df 100644 --- a/ompi/mca/op/example/op_example_module_max.c +++ b/ompi/mca/op/example/op_example_module_max.c @@ -118,7 +118,7 @@ static OBJ_CLASS_INSTANCE(module_max_t, /** * Max function for C float */ -static void max_float(void *in, void *out, int *count, +static void max_float(const void *in, void *out, int *count, ompi_datatype_t **type, ompi_op_base_module_t *module) { module_max_t *m = (module_max_t*) module; @@ -152,7 +152,7 @@ static void max_float(void *in, void *out, int *count, /** * Max function for C double */ -static void max_double(void *in, void *out, int *count, +static void max_double(const void *in, void *out, int *count, ompi_datatype_t **type, ompi_op_base_module_t *module) { module_max_t *m = (module_max_t*) module; @@ -166,7 +166,7 @@ static void max_double(void *in, void *out, int *count, /** * Max function for Fortran REAL */ -static void max_real(void *in, void *out, int *count, +static void max_real(const void *in, void *out, int *count, ompi_datatype_t **type, ompi_op_base_module_t *module) { module_max_t *m = (module_max_t*) module; @@ -180,7 +180,7 @@ static void max_real(void *in, void *out, int *count, /** * Max function for Fortran DOUBLE PRECISION */ -static void max_double_precision(void *in, void *out, int *count, +static void max_double_precision(const void *in, void *out, int *count, ompi_datatype_t **type, ompi_op_base_module_t *module) { From 65be0f9e03f132fea6de669df50cffd34021752f Mon Sep 17 00:00:00 2001 From: Jeff Squyres Date: Thu, 27 Nov 2025 10:01:00 -0500 Subject: [PATCH 25/51] pml/example: Remove outdated example component This component did not compile any more; it is so old that it uses ptl.h (the PTL framework was removed years ago). 
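For reference, the component's headers still pull in the long-removed
framework directly, e.g. this line from the deleted pml_example.h below:

    #include "ompi/mca/ptl/ptl.h"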
Signed-off-by: Jeff Squyres --- ompi/mca/pml/example/.opal_ignore | 0 ompi/mca/pml/example/Makefile.am | 61 -------- ompi/mca/pml/example/pml_example.c | 79 ---------- ompi/mca/pml/example/pml_example.h | 152 ------------------- ompi/mca/pml/example/pml_example_cancel.c | 27 ---- ompi/mca/pml/example/pml_example_component.c | 97 ------------ ompi/mca/pml/example/pml_example_iprobe.c | 47 ------ ompi/mca/pml/example/pml_example_irecv.c | 67 -------- ompi/mca/pml/example/pml_example_isend.c | 50 ------ ompi/mca/pml/example/pml_example_proc.c | 14 -- ompi/mca/pml/example/pml_example_proc.h | 17 --- ompi/mca/pml/example/pml_example_progress.c | 18 --- ompi/mca/pml/example/pml_example_ptl.c | 14 -- ompi/mca/pml/example/pml_example_ptl.h | 17 --- ompi/mca/pml/example/pml_example_recvfrag.c | 21 --- ompi/mca/pml/example/pml_example_recvfrag.h | 21 --- ompi/mca/pml/example/pml_example_recvreq.c | 26 ---- ompi/mca/pml/example/pml_example_recvreq.h | 22 --- ompi/mca/pml/example/pml_example_sendreq.c | 21 --- ompi/mca/pml/example/pml_example_sendreq.h | 21 --- ompi/mca/pml/example/pml_example_start.c | 18 --- 21 files changed, 810 deletions(-) delete mode 100644 ompi/mca/pml/example/.opal_ignore delete mode 100644 ompi/mca/pml/example/Makefile.am delete mode 100644 ompi/mca/pml/example/pml_example.c delete mode 100644 ompi/mca/pml/example/pml_example.h delete mode 100644 ompi/mca/pml/example/pml_example_cancel.c delete mode 100644 ompi/mca/pml/example/pml_example_component.c delete mode 100644 ompi/mca/pml/example/pml_example_iprobe.c delete mode 100644 ompi/mca/pml/example/pml_example_irecv.c delete mode 100644 ompi/mca/pml/example/pml_example_isend.c delete mode 100644 ompi/mca/pml/example/pml_example_proc.c delete mode 100644 ompi/mca/pml/example/pml_example_proc.h delete mode 100644 ompi/mca/pml/example/pml_example_progress.c delete mode 100644 ompi/mca/pml/example/pml_example_ptl.c delete mode 100644 ompi/mca/pml/example/pml_example_ptl.h delete mode 100644 ompi/mca/pml/example/pml_example_recvfrag.c delete mode 100644 ompi/mca/pml/example/pml_example_recvfrag.h delete mode 100644 ompi/mca/pml/example/pml_example_recvreq.c delete mode 100644 ompi/mca/pml/example/pml_example_recvreq.h delete mode 100644 ompi/mca/pml/example/pml_example_sendreq.c delete mode 100644 ompi/mca/pml/example/pml_example_sendreq.h delete mode 100644 ompi/mca/pml/example/pml_example_start.c diff --git a/ompi/mca/pml/example/.opal_ignore b/ompi/mca/pml/example/.opal_ignore deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/ompi/mca/pml/example/Makefile.am b/ompi/mca/pml/example/Makefile.am deleted file mode 100644 index 4c3588848a9..00000000000 --- a/ompi/mca/pml/example/Makefile.am +++ /dev/null @@ -1,61 +0,0 @@ -# -# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana -# University Research and Technology -# Corporation. All rights reserved. -# Copyright (c) 2004-2005 The University of Tennessee and The University -# of Tennessee Research Foundation. All rights -# reserved. -# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, -# University of Stuttgart. All rights reserved. -# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. -# Copyright (c) 2017 IBM Corporation. All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -# Make the output library in this directory, and name it either -# mca__.la (for DSO builds) or libmca__.la -# (for static builds). 
- -if MCA_BUILD_ompi_pml_example_DSO -component_noinst = -component_install = mca_pml_example.la -else -component_noinst = libmca_pml_example.la -component_install = -endif - -local_sources = \ - pml_example.c \ - pml_example.h \ - pml_example_cancel.c \ - pml_example_component.c \ - pml_example_iprobe.c \ - pml_example_irecv.c \ - pml_example_isend.c \ - pml_example_ptl.c \ - pml_example_ptl.h \ - pml_example_proc.c \ - pml_example_proc.h \ - pml_example_progress.c \ - pml_example_recvfrag.c \ - pml_example_recvfrag.h \ - pml_example_recvreq.c \ - pml_example_recvreq.h \ - pml_example_sendreq.c \ - pml_example_sendreq.h \ - pml_example_start.c - -mcacomponentdir = $(ompilibdir) -mcacomponent_LTLIBRARIES = $(component_install) -mca_pml_example_la_SOURCES = $(local_sources) -mca_pml_example_la_LDFLAGS = -module -avoid-version -mca_pml_example_la_LIBADD = $(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la - -noinst_LTLIBRARIES = $(component_noinst) -libmca_pml_example_la_SOURCES = $(local_sources) -libmca_pml_example_la_LDFLAGS = -module -avoid-version - diff --git a/ompi/mca/pml/example/pml_example.c b/ompi/mca/pml/example/pml_example.c deleted file mode 100644 index baabe601e2a..00000000000 --- a/ompi/mca/pml/example/pml_example.c +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Copyright (c) 2006-2007 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2005 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2011 Sandia National Laboratories. All rights reserved. - * Copyright (c) 2018 IBM Corporation. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" -#include "pml_example.h" -#include "pml_example_recvreq.h" -#include "pml_example_sendreq.h" - -mca_pml_example_t mca_pml_example = { - { - mca_pml_example_add_procs, - mca_pml_example_del_procs, - mca_pml_example_add_ptls, - mca_pml_example_control, - mca_pml_example_progress, - mca_pml_example_add_comm, - mca_pml_example_del_comm, - mca_pml_example_irecv_init, - mca_pml_example_irecv, - mca_pml_example_recv, - mca_pml_example_isend_init, - mca_pml_example_isend, - mca_pml_example_send, - mca_pml_example_iprobe, - mca_pml_example_probe, - mca_pml_example_start, - mca_pml_example_improbe, - mca_pml_example_mprobe, - mca_pml_example_imrecv, - mca_pml_example_mrecv, - - 32768, - (0x7fffffff), - 0 /* flags */ - } -}; - -int mca_pml_example_add_comm(ompi_communicator_t* comm) -{ - return OMPI_SUCCESS; -} - -int mca_pml_example_del_comm(ompi_communicator_t* comm) -{ - return OMPI_SUCCESS; -} - -int mca_pml_example_add_ptls(opal_list_t *ptls) -{ - return OMPI_SUCCESS; -} - -int mca_pml_example_control(int param, void* value, size_t size) -{ - return OMPI_SUCCESS; -} - -int mca_pml_example_add_procs(ompi_proc_t** procs, size_t nprocs) -{ - return OMPI_SUCCESS; -} - -int mca_pml_example_del_procs(ompi_proc_t** procs, size_t nprocs) -{ - return OMPI_SUCCESS; -} diff --git a/ompi/mca/pml/example/pml_example.h b/ompi/mca/pml/example/pml_example.h deleted file mode 100644 index 92fb39c8377..00000000000 --- a/ompi/mca/pml/example/pml_example.h +++ /dev/null @@ -1,152 +0,0 @@ -/* - * Copyright (c) 2006-2007 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. 
- * Copyright (c) 2004-2007 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2011 Sandia National Laboratories. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef PML_EXAMPLE_H_HAS_BEEN_INCLUDED -#define PML_EXAMPLE_H_HAS_BEEN_INCLUDED - -#include "ompi/request/request.h" -#include "ompi/mca/pml/pml.h" -#include "ompi/mca/ptl/ptl.h" - -BEGIN_C_DECLS - -struct mca_pml_example_t { - mca_pml_base_module_t super; - - mca_ptl_base_component_t **example_ptl_components; - size_t example_num_ptl_components; - - mca_ptl_base_module_t** example_ptl_modules; - size_t example_num_ptl_modules; - - opal_list_t example_procs; - opal_mutex_t example_lock; - - /* list of pending send requests */ - opal_list_t example_send_pending; -}; -typedef struct mca_pml_example_t mca_pml_example_t; - -extern mca_pml_example_t mca_pml_example; - -/* - * PML interface functions. - */ -extern int mca_pml_example_add_comm( struct ompi_communicator_t* comm ); -extern int mca_pml_example_del_comm( struct ompi_communicator_t* comm ); - -extern int mca_pml_example_add_procs( struct ompi_proc_t **procs, size_t nprocs ); -extern int mca_pml_example_del_procs( struct ompi_proc_t **procs, size_t nprocs ); - -extern int mca_pml_example_add_ptls( opal_list_t *ptls ); - -extern int mca_pml_example_control( int param, void *size, size_t value ); - -extern int mca_pml_example_iprobe( int dst, - int tag, - struct ompi_communicator_t* comm, - int *matched, - ompi_status_public_t* status ); - -extern int mca_pml_example_probe( int dst, - int tag, - struct ompi_communicator_t* comm, - ompi_status_public_t* status ); - -extern int mca_pml_example_improbe(int dst, - int tag, - struct ompi_communicator_t* comm, - int *matched, - struct ompi_message_t **message, - ompi_status_public_t* status); - -extern int mca_pml_example_mprobe(int dst, - int tag, - struct ompi_communicator_t* comm, - struct ompi_message_t **message, - ompi_status_public_t* status); - -extern int mca_pml_example_cancel( ompi_request_t* request ); -extern int mca_pml_example_cancelled( ompi_request_t* request, int *flag ); - -extern int mca_pml_example_isend_init( void *buf, - size_t count, - ompi_datatype_t *datatype, - int dst, - int tag, - mca_pml_base_send_mode_t mode, - struct ompi_communicator_t* comm, - struct ompi_request_t **request ); - -extern int mca_pml_example_isend( void *buf, - size_t count, - ompi_datatype_t *datatype, - int dst, - int tag, - mca_pml_base_send_mode_t mode, - struct ompi_communicator_t* comm, - struct ompi_request_t **request ); - -extern int mca_pml_example_send( void *buf, - size_t count, - ompi_datatype_t *datatype, - int dst, - int tag, - mca_pml_base_send_mode_t mode, - struct ompi_communicator_t* comm ); - -extern int mca_pml_example_irecv_init( void *buf, - size_t count, - ompi_datatype_t *datatype, - int src, - int tag, - struct ompi_communicator_t* comm, - struct ompi_request_t **request ); - -extern int mca_pml_example_irecv( void *buf, - size_t count, - ompi_datatype_t *datatype, - int src, - int tag, - struct ompi_communicator_t* comm, - struct ompi_request_t **request ); - -extern int mca_pml_example_recv( void *buf, - size_t count, - ompi_datatype_t *datatype, - int src, - int tag, - struct ompi_communicator_t* comm, - ompi_status_public_t* status ); - -extern int mca_pml_example_imrecv(void *buf, - size_t count, - ompi_datatype_t *datatype, - struct ompi_message_t **message, - struct 
ompi_request_t **request); - -extern int mca_pml_example_mrecv(void *buf, - size_t count, - ompi_datatype_t *datatype, - struct ompi_message_t **message, - ompi_status_public_t* status); - -extern int mca_pml_example_progress(void); - -extern int mca_pml_example_start( size_t count, ompi_request_t** requests ); - -END_C_DECLS - -#endif /* PML_EXAMPLE_H_HAS_BEEN_INCLUDED */ diff --git a/ompi/mca/pml/example/pml_example_cancel.c b/ompi/mca/pml/example/pml_example_cancel.c deleted file mode 100644 index 8d162d2b229..00000000000 --- a/ompi/mca/pml/example/pml_example_cancel.c +++ /dev/null @@ -1,27 +0,0 @@ -/* - * Copyright (c) 2004-2005 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" - -#include "pml_example.h" - -int mca_pml_example_cancel(ompi_request_t* request) -{ - return OMPI_SUCCESS; -} - -int mca_pml_example_cancelled(ompi_request_t* request, int* flag) -{ - if(NULL != flag) - *flag = 0; - return OMPI_SUCCESS; -} - diff --git a/ompi/mca/pml/example/pml_example_component.c b/ompi/mca/pml/example/pml_example_component.c deleted file mode 100644 index 6ee8ff44415..00000000000 --- a/ompi/mca/pml/example/pml_example_component.c +++ /dev/null @@ -1,97 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2005 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2015 Los Alamos National Security, LLC. All rights - * reserved. - * Copyright (c) 2020 Amazon.com, Inc. or its affiliates. - * All Rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" - -#include "opal/util/event.h" -#include "pml_example.h" - -static int mca_pml_example_component_register(void); -static int mca_pml_example_component_open(void); -static int mca_pml_example_component_close(void); -static mca_pml_base_module_t* mca_pml_example_component_init( int* priority, - bool *allow_multi_user_threads, bool *have_hidden_threads ); -static int mca_pml_example_component_fini(void); - -static int mca_pml_example_priority = 0; - -mca_pml_base_component_2_1_0_t mca_pml_example_component = { - - /* First, the mca_base_component_t struct containing meta - * information about the component itself */ - - .pmlm_version = { - MCA_PML_BASE_VERSION_2_1_0, - - .mca_component_name = "example", - MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION, - OMPI_RELEASE_VERSION), - .mca_open_component = mca_pml_example_component_open, - .mca_close_component = mca_pml_example_component_close, - .mca_register_component_params = mca_pml_example_component_register, - }, - .pmlm_data = { - /* The component is checkpoint ready */ - MCA_BASE_METADATA_PARAM_CHECKPOINT - }, - - .pmlm_init = mca_pml_example_component_init, - .pmlm_finalize = mca_pml_example_component_fini, -}; -MCA_BASE_COMPONENT_INIT(ompi, pml, example) - -static int mca_pml_example_component_register(void) -{ - mca_pml_example_priority = 0; - (void) mca_base_component_var_register(&mca_pml_example_component.pmlm_version, - "priority", "Priority of the pml example component", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, - &mca_pml_example_priority); - - return OMPI_SUCCESS; -} - -static int mca_pml_example_component_open(void) -{ - return OMPI_SUCCESS; -} - -static int mca_pml_example_component_close(void) -{ - return OMPI_SUCCESS; -} - -static mca_pml_base_module_t* -mca_pml_example_component_init( int* priority, - bool *allow_multi_user_threads, - bool *have_hidden_threads ) -{ - *priority = mca_pml_example_priority; - *have_hidden_threads = false; - *allow_multi_user_threads &= true; - return &mca_pml_example.super; -} - -static int mca_pml_example_component_fini(void) -{ - return OMPI_SUCCESS; -} - diff --git a/ompi/mca/pml/example/pml_example_iprobe.c b/ompi/mca/pml/example/pml_example_iprobe.c deleted file mode 100644 index d7d76ff7e18..00000000000 --- a/ompi/mca/pml/example/pml_example_iprobe.c +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright (c) 2004-2005 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2011 Sandia National Laboratories. All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" -#include "pml_example.h" - -int mca_pml_example_iprobe( int src, int tag, - struct ompi_communicator_t *comm, - int *matched, ompi_status_public_t * status ) -{ - return OMPI_SUCCESS; -} - -int mca_pml_example_probe( int src, int tag, - struct ompi_communicator_t *comm, - ompi_status_public_t * status ) -{ - return OMPI_SUCCESS; -} - -int mca_pml_example_improbe(int dst, - int tag, - struct ompi_communicator_t* comm, - int *matched, - struct ompi_message_t **message, - ompi_status_public_t* status) -{ - return OMPI_SUCCESS; -} - -int mca_pml_example_mprobe(int dst, - int tag, - struct ompi_communicator_t* comm, - struct ompi_message_t **message, - ompi_status_public_t* status) -{ - return OMPI_SUCCESS; -} diff --git a/ompi/mca/pml/example/pml_example_irecv.c b/ompi/mca/pml/example/pml_example_irecv.c deleted file mode 100644 index 4e91e9ea7de..00000000000 --- a/ompi/mca/pml/example/pml_example_irecv.c +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright (c) 2004-2005 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2011 Sandia National Laboratories. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" -#include "pml_example.h" -#include "ompi/request/request.h" - -int mca_pml_example_irecv_init( void *addr, - size_t count, - ompi_datatype_t * datatype, - int src, - int tag, - struct ompi_communicator_t *comm, - struct ompi_request_t **request ) -{ - return OMPI_SUCCESS; -} - -int mca_pml_example_irecv( void *addr, - size_t count, - ompi_datatype_t * datatype, - int src, - int tag, - struct ompi_communicator_t *comm, - struct ompi_request_t **request ) -{ - return OMPI_SUCCESS; -} - - -int mca_pml_example_recv( void *addr, - size_t count, - ompi_datatype_t * datatype, - int src, - int tag, - struct ompi_communicator_t *comm, - ompi_status_public_t * status ) -{ - return OMPI_SUCCESS; -} - -int mca_pml_example_imrecv(void *buf, - size_t count, - ompi_datatype_t *datatype, - struct ompi_message_t **message, - struct ompi_request_t **request) -{ - return OMPI_SUCCESS; -} - -int mca_pml_example_mrecv(void *buf, - size_t count, - ompi_datatype_t *datatype, - struct ompi_message_t **message, - ompi_status_public_t* status) -{ - return OMPI_SUCCESS; -} diff --git a/ompi/mca/pml/example/pml_example_isend.c b/ompi/mca/pml/example/pml_example_isend.c deleted file mode 100644 index 61ce9792e36..00000000000 --- a/ompi/mca/pml/example/pml_example_isend.c +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Copyright (c) 2004-2005 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" -#include "pml_example.h" - -int mca_pml_example_isend_init( void* buf, - size_t count, - ompi_datatype_t* datatype, - int dst, - int tag, - mca_pml_base_send_mode_t sendmode, - ompi_communicator_t* comm, - ompi_request_t** request ) -{ - return OMPI_SUCCESS; -} - - -int mca_pml_example_isend( void* buf, - size_t count, - ompi_datatype_t* datatype, - int dst, - int tag, - mca_pml_base_send_mode_t sendmode, - ompi_communicator_t* comm, - ompi_request_t** request ) -{ - return OMPI_SUCCESS; -} - -int mca_pml_example_send( void *buf, - size_t count, - ompi_datatype_t* datatype, - int dst, - int tag, - mca_pml_base_send_mode_t sendmode, - ompi_communicator_t* comm ) -{ - return OMPI_SUCCESS; -} - diff --git a/ompi/mca/pml/example/pml_example_proc.c b/ompi/mca/pml/example/pml_example_proc.c deleted file mode 100644 index 229ff65ae8c..00000000000 --- a/ompi/mca/pml/example/pml_example_proc.c +++ /dev/null @@ -1,14 +0,0 @@ -/* - * Copyright (c) 2004-2005 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" -#include "pml_example.h" - diff --git a/ompi/mca/pml/example/pml_example_proc.h b/ompi/mca/pml/example/pml_example_proc.h deleted file mode 100644 index d86e2abd762..00000000000 --- a/ompi/mca/pml/example/pml_example_proc.h +++ /dev/null @@ -1,17 +0,0 @@ -/* - * Copyright (c) 2004-2005 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef PML_EXAMPLE_PROC_H_HAS_BEEN_INCLUDED -#define PML_EXAMPLE_PROC_H_HAS_BEEN_INCLUDED - - - -#endif /* PML_EXAMPLE_PROC_H_HAS_BEEN_INCLUDED */ diff --git a/ompi/mca/pml/example/pml_example_progress.c b/ompi/mca/pml/example/pml_example_progress.c deleted file mode 100644 index 9217a2d74f8..00000000000 --- a/ompi/mca/pml/example/pml_example_progress.c +++ /dev/null @@ -1,18 +0,0 @@ -/* - * Copyright (c) 2004-2005 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" -#include "pml_example.h" - -int mca_pml_example_progress(void) -{ - return 0; -} diff --git a/ompi/mca/pml/example/pml_example_ptl.c b/ompi/mca/pml/example/pml_example_ptl.c deleted file mode 100644 index 229ff65ae8c..00000000000 --- a/ompi/mca/pml/example/pml_example_ptl.c +++ /dev/null @@ -1,14 +0,0 @@ -/* - * Copyright (c) 2004-2005 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" -#include "pml_example.h" - diff --git a/ompi/mca/pml/example/pml_example_ptl.h b/ompi/mca/pml/example/pml_example_ptl.h deleted file mode 100644 index efb3bc52c74..00000000000 --- a/ompi/mca/pml/example/pml_example_ptl.h +++ /dev/null @@ -1,17 +0,0 @@ -/* - * Copyright (c) 2004-2005 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef PML_EXAMPLE_PTL_H_HAS_BEEN_INCLUDED -#define PML_EXAMPLE_PTL_H_HAS_BEEN_INCLUDED - - - -#endif /* PML_EXAMPLE_PTL_H_HAS_BEEN_INCLUDED */ diff --git a/ompi/mca/pml/example/pml_example_recvfrag.c b/ompi/mca/pml/example/pml_example_recvfrag.c deleted file mode 100644 index 32340183ae7..00000000000 --- a/ompi/mca/pml/example/pml_example_recvfrag.c +++ /dev/null @@ -1,21 +0,0 @@ -/* - * Copyright (c) 2004-2005 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" -#include "pml_example.h" -#include "pml_example_recvfrag.h" - -bool mca_pml_example_recv_frag_match( mca_ptl_base_module_t* ptl, - mca_ptl_base_recv_frag_t* frag, - mca_ptl_base_match_header_t* header ) -{ - return false; -} diff --git a/ompi/mca/pml/example/pml_example_recvfrag.h b/ompi/mca/pml/example/pml_example_recvfrag.h deleted file mode 100644 index 09f3c3843af..00000000000 --- a/ompi/mca/pml/example/pml_example_recvfrag.h +++ /dev/null @@ -1,21 +0,0 @@ -/* - * Copyright (c) 2004-2005 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef PML_EXAMPLE_RECVFRAG_H_HAS_BEEN_INCLUDED -#define PML_EXAMPLE_RECVFRAG_H_HAS_BEEN_INCLUDED - -#include "ompi/mca/ptl/base/ptl_base_recvfrag.h" - -bool mca_pml_example_recv_frag_match( mca_ptl_base_module_t* ptl, - mca_ptl_base_recv_frag_t* frag, - mca_ptl_base_match_header_t* header ); - -#endif /* PML_EXAMPLE_RECVFRAG_H_HAS_BEEN_INCLUDED */ diff --git a/ompi/mca/pml/example/pml_example_recvreq.c b/ompi/mca/pml/example/pml_example_recvreq.c deleted file mode 100644 index 0c4efadd695..00000000000 --- a/ompi/mca/pml/example/pml_example_recvreq.c +++ /dev/null @@ -1,26 +0,0 @@ -/* - * Copyright (c) 2004-2005 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" -#include "pml_example.h" -#include "pml_example_recvreq.h" - -/* - * Update the recv request status to reflect the number of bytes - * received and actually delivered to the application. - */ - -void mca_pml_example_recv_request_progress( struct mca_ptl_base_module_t* ptl, - mca_pml_base_recv_request_t* req, - size_t bytes_received, - size_t bytes_delivered ) -{ -} diff --git a/ompi/mca/pml/example/pml_example_recvreq.h b/ompi/mca/pml/example/pml_example_recvreq.h deleted file mode 100644 index 6ab3c7cbccf..00000000000 --- a/ompi/mca/pml/example/pml_example_recvreq.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright (c) 2004-2005 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef PML_EXAMPLE_RECVREQ_H_HAS_BEEN_INCLUDED -#define PML_EXAMPLE_RECVREQ_H_HAS_BEEN_INCLUDED - -#include "ompi/mca/pml/base/pml_base_recvreq.h" - -void mca_pml_example_recv_request_progress( struct mca_ptl_base_module_t* ptl, - mca_pml_base_recv_request_t* req, - size_t bytes_received, - size_t bytes_delivered ); - -#endif /* PML_EXAMPLE_RECVREQ_H_HAS_BEEN_INCLUDED */ diff --git a/ompi/mca/pml/example/pml_example_sendreq.c b/ompi/mca/pml/example/pml_example_sendreq.c deleted file mode 100644 index 9f5dea91b1a..00000000000 --- a/ompi/mca/pml/example/pml_example_sendreq.c +++ /dev/null @@ -1,21 +0,0 @@ -/* - * Copyright (c) 2004-2005 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" -#include "pml_example.h" -#include "pml_example_sendreq.h" - -void mca_pml_example_send_request_progress( struct mca_ptl_base_module_t* ptl, - mca_pml_base_send_request_t* req, - size_t bytes_sent ) -{ -} - diff --git a/ompi/mca/pml/example/pml_example_sendreq.h b/ompi/mca/pml/example/pml_example_sendreq.h deleted file mode 100644 index 5e4ef1fd2ba..00000000000 --- a/ompi/mca/pml/example/pml_example_sendreq.h +++ /dev/null @@ -1,21 +0,0 @@ -/* - * Copyright (c) 2004-2005 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef PML_EXAMPLE_SENDREQ_H_HAS_BEEN_INCLUDED -#define PML_EXAMPLE_SENDREQ_H_HAS_BEEN_INCLUDED - -#include "ompi/mca/pml/base/pml_base_sendreq.h" - -void mca_pml_example_send_request_progress( struct mca_ptl_base_module_t* ptl, - mca_pml_base_send_request_t* req, - size_t bytes_sent ); - -#endif /* PML_EXAMPLE_SENDREQ_H_HAS_BEEN_INCLUDED */ diff --git a/ompi/mca/pml/example/pml_example_start.c b/ompi/mca/pml/example/pml_example_start.c deleted file mode 100644 index e57d7a0e3b8..00000000000 --- a/ompi/mca/pml/example/pml_example_start.c +++ /dev/null @@ -1,18 +0,0 @@ -/* - * Copyright (c) 2004-2005 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" -#include "pml_example.h" - -int mca_pml_example_start(size_t count, ompi_request_t** requests) -{ - return OMPI_SUCCESS; -} From 9f7dddf582570241f091baba76d6a8530608d21f Mon Sep 17 00:00:00 2001 From: Jeff Squyres Date: Thu, 27 Nov 2025 10:01:41 -0500 Subject: [PATCH 26/51] vprotocol/example: remove stale example component This component no longer compiles; it refers to multiple header files that do not seem to exist any longer. 
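For reference, the deleted sources below still include PML-v headers
that are no longer in the tree:

    #include "../pml_v.h"
    #include "../pml_v_protocol.h"
    #include "../pml_v_protocol_base.h"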
Signed-off-by: Jeff Squyres --- ompi/mca/vprotocol/example/.opal_ignore | 0 ompi/mca/vprotocol/example/Makefile.am | 48 -------- ompi/mca/vprotocol/example/owner.txt | 7 -- .../mca/vprotocol/example/vprotocol_example.c | 48 -------- .../mca/vprotocol/example/vprotocol_example.h | 96 ---------------- .../example/vprotocol_example_comm.c | 24 ---- .../example/vprotocol_example_component.c | 106 ------------------ .../example/vprotocol_example_probe.c | 29 ----- .../example/vprotocol_example_proc.c | 25 ----- .../example/vprotocol_example_progress.c | 24 ---- .../example/vprotocol_example_recv.c | 66 ----------- .../example/vprotocol_example_send.c | 51 --------- .../example/vprotocol_example_start.c | 19 ---- .../example/vprotocol_example_start.h | 19 ---- .../example/vprotocol_example_wait.c | 31 ----- .../example/vprotocol_example_wait.h | 21 ---- 16 files changed, 614 deletions(-) delete mode 100644 ompi/mca/vprotocol/example/.opal_ignore delete mode 100644 ompi/mca/vprotocol/example/Makefile.am delete mode 100644 ompi/mca/vprotocol/example/owner.txt delete mode 100644 ompi/mca/vprotocol/example/vprotocol_example.c delete mode 100644 ompi/mca/vprotocol/example/vprotocol_example.h delete mode 100644 ompi/mca/vprotocol/example/vprotocol_example_comm.c delete mode 100644 ompi/mca/vprotocol/example/vprotocol_example_component.c delete mode 100644 ompi/mca/vprotocol/example/vprotocol_example_probe.c delete mode 100644 ompi/mca/vprotocol/example/vprotocol_example_proc.c delete mode 100644 ompi/mca/vprotocol/example/vprotocol_example_progress.c delete mode 100644 ompi/mca/vprotocol/example/vprotocol_example_recv.c delete mode 100644 ompi/mca/vprotocol/example/vprotocol_example_send.c delete mode 100644 ompi/mca/vprotocol/example/vprotocol_example_start.c delete mode 100644 ompi/mca/vprotocol/example/vprotocol_example_start.h delete mode 100644 ompi/mca/vprotocol/example/vprotocol_example_wait.c delete mode 100644 ompi/mca/vprotocol/example/vprotocol_example_wait.h diff --git a/ompi/mca/vprotocol/example/.opal_ignore b/ompi/mca/vprotocol/example/.opal_ignore deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/ompi/mca/vprotocol/example/Makefile.am b/ompi/mca/vprotocol/example/Makefile.am deleted file mode 100644 index 64ec3e4cca0..00000000000 --- a/ompi/mca/vprotocol/example/Makefile.am +++ /dev/null @@ -1,48 +0,0 @@ -# -# Copyright (c) 2004-2007 The Trustees of the University of Tennessee. -# All rights reserved. -# Copyright (c) 2017 IBM Corporation. All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -# Make the output library in this directory, and name it either -# protocol_.la (for DSO builds) or libprotocol_.la -# (for static builds). 
- -if MCA_BUILD_ompi_vprotocol_example_DSO -component_noinst = -component_install = mca_vprotocol_example.la -else -component_noinst = libmca_vprotocol_example.la -component_install = -endif - -local_sources = \ - vprotocol_example.h \ - vprotocol_example.c \ - vprotocol_example_component.c \ - vprotocol_example_proc.c \ - vprotocol_example_comm.c \ - vprotocol_example_progress.c \ - vprotocol_example_start.c \ - vprotocol_example_recv.c \ - vprotocol_example_send.c \ - vprotocol_example_probe.c \ - vprotocol_example_wait.h \ - vprotocol_example_wait.c -mcacomponentdir = $(ompilibdir) -mcacomponent_LTLIBRARIES = $(component_install) -mca_vprotocol_example_la_SOURCES = $(local_sources) -mca_vprotocol_example_la_LIBADD = $(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la -mca_vprotocol_example_la_CFLAGS = -mca_vprotocol_example_la_LDFLAGS = -module -avoid-version - -noinst_LTLIBRARIES = $(component_noinst) -libmca_vprotocol_example_la_SOURCES = $(local_sources) -libmca_vprotocol_example_la_LIBADD = -libmca_vprotocol_example_la_CFLAGS = -libmca_vprotocol_example_la_LDFLAGS = -module -avoid-version diff --git a/ompi/mca/vprotocol/example/owner.txt b/ompi/mca/vprotocol/example/owner.txt deleted file mode 100644 index c47a2d510b1..00000000000 --- a/ompi/mca/vprotocol/example/owner.txt +++ /dev/null @@ -1,7 +0,0 @@ -# -# owner/status file -# owner: institution that is responsible for this package -# status: e.g. active, maintenance, unmaintained -# -owner: UTK -status: maintenance diff --git a/ompi/mca/vprotocol/example/vprotocol_example.c b/ompi/mca/vprotocol/example/vprotocol_example.c deleted file mode 100644 index 54bc78a3508..00000000000 --- a/ompi/mca/vprotocol/example/vprotocol_example.c +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright (c) 2004-2007 The Trustees of the University of Tennessee. - * All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" -#include "vprotocol_example.h" - -mca_vprotocol_example_module_t mca_vprotocol_example = -{ - { - /* mca_pml_base_module_add_procs_fn_t */ mca_vprotocol_example_add_procs, - /* mca_pml_base_module_del_procs_fn_t */ mca_vprotocol_example_del_procs, - /* mca_pml_base_module_enable_fn_t */ mca_vprotocol_example_enable, - /* mca_pml_base_module_progress_fn_t */ mca_vprotocol_example_progress, - - /* mca_pml_base_module_add_comm_fn_t */ mca_vprotocol_example_add_comm, - /* mca_pml_base_module_del_comm_fn_t */ mca_vprotocol_example_del_comm, - /* mca_pml_base_module_irecv_init_fn_t */ mca_vprotocol_example_irecv_init, - /* mca_pml_base_module_irecv_fn_t */ mca_vprotocol_example_irecv, - /* mca_pml_base_module_recv_fn_t */ mca_vprotocol_example_recv, - /* mca_pml_base_module_isend_init_fn_t */ mca_vprotocol_example_isend_init, - /* mca_pml_base_module_isend_fn_t */ mca_vprotocol_example_isend, - /* mca_pml_base_module_send_fn_t */ mca_vprotocol_example_send, - /* mca_pml_base_module_iprobe_fn_t */ mca_vprotocol_example_iprobe, - /* mca_pml_base_module_probe_fn_t */ mca_vprotocol_example_probe, - /* mca_pml_base_module_start_fn_t */ mca_vprotocol_example_start, - - /* mca_pml_base_module_dump_fn_t */ mca_vprotocol_example_dump, - - /* opal_class_t * */ NULL, - }, -/** - * Insert here your own protocol structures - */ -}; - -OMPI_DECLSPEC int mca_vprotocol_example_dump(struct ompi_communicator_t* comm, int verbose) -{ - V_OUTPUT("vprotocol_example dump for comm %d", comm->c_contextid); - return mca_pml_v.host_pml.pml_dump(comm, verbose); -} - diff --git a/ompi/mca/vprotocol/example/vprotocol_example.h b/ompi/mca/vprotocol/example/vprotocol_example.h deleted file mode 100644 index 98b119851a1..00000000000 --- a/ompi/mca/vprotocol/example/vprotocol_example.h +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Copyright (c) 2004-2007 The Trustees of the University of Tennessee. - * All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef __INCLUDE_VPROTOCOL_EXAMPLE_H__ -#define __INCLUDE_VPROTOCOL_EXAMPLE_H__ - -#include "ompi_config.h" -#include "../pml_v.h" -#include "../pml_v_protocol.h" - -#include "vprotocol_example_wait.h" -#include "ompi/communicator/communicator.h" - -typedef struct mca_vprotocol_example_module_t { - mca_pml_v_protocol_base_module_t super; -/** - * Insert here your own protocol structures - */ -} mca_vprotocol_example_module_t; - -extern mca_vprotocol_example_module_t mca_vprotocol_example; - -OMPI_DECLSPEC int mca_vprotocol_example_add_procs(struct ompi_proc_t **procs, size_t nprocs); -OMPI_DECLSPEC int mca_vprotocol_example_del_procs(struct ompi_proc_t **procs, size_t nprocs); -OMPI_DECLSPEC int mca_vprotocol_example_enable(bool enable); -OMPI_DECLSPEC int mca_vprotocol_example_progress(void); -OMPI_DECLSPEC int mca_vprotocol_example_add_comm(struct ompi_communicator_t* comm); -OMPI_DECLSPEC int mca_vprotocol_example_del_comm(struct ompi_communicator_t* comm); - -OMPI_DECLSPEC int mca_vprotocol_example_irecv_init(void *buf, - size_t count, - struct ompi_datatype_t *datatype, - int src, - int tag, - struct ompi_communicator_t* comm, - struct ompi_request_t **request ); -OMPI_DECLSPEC int mca_vprotocol_example_irecv(void *addr, - size_t count, - ompi_datatype_t * datatype, - int src, - int tag, - struct ompi_communicator_t *comm, - struct ompi_request_t **request ); -OMPI_DECLSPEC int mca_vprotocol_example_recv(void *addr, - size_t count, - ompi_datatype_t * datatype, - int src, - int tag, - struct ompi_communicator_t *comm, - ompi_status_public_t * status ); - -OMPI_DECLSPEC int mca_vprotocol_example_isend_init(void *buf, - size_t count, - struct ompi_datatype_t *datatype, - int dst, - int tag, - mca_pml_base_send_mode_t mode, - struct ompi_communicator_t* comm, - struct ompi_request_t **request ); -OMPI_DECLSPEC int mca_vprotocol_example_isend(void *buf, - size_t count, - ompi_datatype_t* datatype, - int dst, - int tag, - mca_pml_base_send_mode_t sendmode, - ompi_communicator_t* comm, - ompi_request_t** request ); -OMPI_DECLSPEC int mca_vprotocol_example_send(void *buf, - size_t count, - ompi_datatype_t* datatype, - int dst, - int tag, - mca_pml_base_send_mode_t sendmode, - ompi_communicator_t* comm ); - -OMPI_DECLSPEC int mca_vprotocol_example_iprobe(int src, int tag, - struct ompi_communicator_t *comm, - int *matched, ompi_status_public_t * status ); -OMPI_DECLSPEC int mca_vprotocol_example_probe(int src, int tag, - struct ompi_communicator_t *comm, - ompi_status_public_t * status ); - -OMPI_DECLSPEC int mca_vprotocol_example_start(size_t count, - struct ompi_request_t** requests ); - -OMPI_DECLSPEC int mca_vprotocol_example_dump(struct ompi_communicator_t* comm, - int verbose ); - -#endif /* __INCLUDE_VPROTOCOL_EXAMPLE_H__ */ diff --git a/ompi/mca/vprotocol/example/vprotocol_example_comm.c b/ompi/mca/vprotocol/example/vprotocol_example_comm.c deleted file mode 100644 index bd14319e580..00000000000 --- a/ompi/mca/vprotocol/example/vprotocol_example_comm.c +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright (c) 2004-2007 The Trustees of the University of Tennessee. - * All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "../pml_v.h" -#include "vprotocol_example.h" - -int mca_vprotocol_example_add_comm(struct ompi_communicator_t* comm) -{ - V_OUTPUT_VERBOSE(30, "vprotocol_example_add_comm(%d)", comm->c_contextid); - return mca_pml_v.host_pml.pml_add_comm(comm); -} - -int mca_vprotocol_example_del_comm(struct ompi_communicator_t* comm) -{ - V_OUTPUT_VERBOSE(30, "vprotocol_example_del_comm(%d)", comm->c_contextid); - return mca_pml_v.host_pml.pml_del_comm(comm); -} diff --git a/ompi/mca/vprotocol/example/vprotocol_example_component.c b/ompi/mca/vprotocol/example/vprotocol_example_component.c deleted file mode 100644 index 51c36e0198d..00000000000 --- a/ompi/mca/vprotocol/example/vprotocol_example_component.c +++ /dev/null @@ -1,106 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2004-2007 The Trustees of the University of Tennessee. - * All rights reserved. - * Copyright (c) 2015 Los Alamos National Security, LLC. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" - -#include "ompi/mca/mca.h" -#include "../pml_v.h" -#include "../pml_v_protocol_base.h" -#include "vprotocol_example.h" - -static int mca_vprotocol_example_component_open(void); -static int mca_vprotocol_example_component_close(void); - -static mca_pml_v_protocol_base_module_t *mca_vprotocol_example_component_init( int* priority, - bool, bool); -static int mca_vprotocol_example_component_finalize(void); - - -static int _priority; - - -mca_pml_v_protocol_base_component_2_0_0_t mca_vprotocol_example_component = -{ - /* First, the mca_base_component_t struct containing meta - * information about the component itself */ - .pmlm_version = { - MCA_VPROTOCOL_BASE_VERSION_2_0_0, - - .mca_component_name = "example", - MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION, - OMPI_RELEASE_VERSION), - .mca_open_component = mca_vprotocol_example_component_open, - .mca_close_component = mca_vprotocol_example_component_close, - }, - .pmlm_data = { - /* component is not checkpointable */ - MCA_BASE_METADATA_PARAM_NONE - }, - - .pmlm_init = mca_vprotocol_example_component_init, - .pmlm_finalize = mca_vprotocol_example_component_finalize, -}; -MCA_BASE_COMPONENT_INIT(ompi, vprotocol, example) - -/** MCA level functions - */ - -int mca_vprotocol_example_component_open(void) -{ - _priority = mca_param_register_int( "priority", -1); - V_OUTPUT_VERBOSE(10, "vprotocol_example_open, read priority %d", _priority); - return OMPI_SUCCESS; -} - -int mca_vprotocol_example_component_close(void) -{ - V_OUTPUT_VERBOSE(10, "vprotocol_example_close"); - return OMPI_SUCCESS; -} - -/** VPROTOCOL level functions (same as PML one) - */ - -mca_pml_v_protocol_base_module_t *mca_vprotocol_example_component_init( int* priority, - bool enable_progress_threads, - bool enable_mpi_threads) -{ - V_OUTPUT_VERBOSE(10, "vprotocol_example_init"); - *priority = _priority; - -/** - * Some protocols requires sanity check about thread support (those making piecewise deterministic assumption) - if(enable_mpi_threads) - { - OPAL_OUTPUT_VERBOSE( mca_pml_v_verbose, mca_pml_v_output, "vprotocol_example.init: threads are enabled, and not supported by vprotocol example fault tolerant layer, will not load")); - return NULL; - } - */ - -/** - * Insert your own protocol initialization here - */ - - return &mca_vprotocol_example.super; -} - -int 
mca_vprotocol_example_component_finalize(void) -{ - V_OUTPUT_VERBOSE(10, "vprotocol_example_finalize"); - -/** - * Insert your own garbage collecting here - */ - - return OMPI_SUCCESS; -} diff --git a/ompi/mca/vprotocol/example/vprotocol_example_probe.c b/ompi/mca/vprotocol/example/vprotocol_example_probe.c deleted file mode 100644 index 87b6de4caee..00000000000 --- a/ompi/mca/vprotocol/example/vprotocol_example_probe.c +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Copyright (c) 2004-2007 The Trustees of the University of Tennessee. - * All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" -#include "../pml_v.h" -#include "vprotocol_example.h" - -int mca_vprotocol_example_probe( int src, int tag, - struct ompi_communicator_t *comm, - ompi_status_public_t * status ) -{ - V_OUTPUT_VERBOSE(50, "mca_vprotocol_example_probe(%d, %d, %d)", src, tag, comm->c_contextid); - return mca_pml_v.host_pml.pml_probe(src, tag, comm, status); -} - -int mca_vprotocol_example_iprobe( int src, int tag, - struct ompi_communicator_t *comm, - int *matched, ompi_status_public_t * status ) -{ - V_OUTPUT_VERBOSE(60, "mca_vprotocol_example_iprobe(%d, %d, %d)", src, tag, comm->c_contextid); - return mca_pml_v.host_pml.pml_iprobe(src, tag, comm, matched, status); -} diff --git a/ompi/mca/vprotocol/example/vprotocol_example_proc.c b/ompi/mca/vprotocol/example/vprotocol_example_proc.c deleted file mode 100644 index 9ca77db948c..00000000000 --- a/ompi/mca/vprotocol/example/vprotocol_example_proc.c +++ /dev/null @@ -1,25 +0,0 @@ -/* - * Copyright (c) 2004-2007 The Trustees of the University of Tennessee. - * All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" -#include "../pml_v.h" -#include "vprotocol_example.h" - -int mca_vprotocol_example_add_procs(struct ompi_proc_t **procs, size_t nprocs) -{ - V_OUTPUT_VERBOSE(30, "adding %ld procs", (long) nprocs); - return mca_pml_v.host_pml.pml_add_procs(procs, nprocs); -} - -int mca_vprotocol_example_del_procs(struct ompi_proc_t **procs, size_t nprocs) -{ - V_OUTPUT_VERBOSE(30, "removing %ld procs", (long) nprocs); - return mca_pml_v.host_pml.pml_del_procs(procs, nprocs); -} diff --git a/ompi/mca/vprotocol/example/vprotocol_example_progress.c b/ompi/mca/vprotocol/example/vprotocol_example_progress.c deleted file mode 100644 index 1081bd961aa..00000000000 --- a/ompi/mca/vprotocol/example/vprotocol_example_progress.c +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright (c) 2004-2007 The Trustees of the University of Tennessee. - * All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "../pml_v.h" -#include "vprotocol_example.h" - -int mca_vprotocol_example_enable(bool enable) -{ - V_OUTPUT_VERBOSE(15, "enable=%d", enable); - return mca_pml_v.host_pml.pml_enable(enable); -} - -int mca_vprotocol_example_progress(void) -{ - V_OUTPUT_VERBOSE(100, "progress..."); - return mca_pml_v.host_pml.pml_progress(); -} diff --git a/ompi/mca/vprotocol/example/vprotocol_example_recv.c b/ompi/mca/vprotocol/example/vprotocol_example_recv.c deleted file mode 100644 index dbaaa9c88cf..00000000000 --- a/ompi/mca/vprotocol/example/vprotocol_example_recv.c +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Copyright (c) 2004-2007 The Trustees of the University of Tennessee. - * All rights reserved. - * Copyright (c) 2010 The University of Tennessee and The University - * of Tennessee Research Foundation. 
All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" -#include "../pml_v.h" -#include "vprotocol_example.h" - -int mca_vprotocol_example_irecv_init(void *addr, - size_t count, - struct ompi_datatype_t *datatype, - int src, - int tag, - struct ompi_communicator_t* comm, - struct ompi_request_t **request ) -{ - int ret; - - ret = mca_pml_v.host_pml.pml_irecv_init(addr, count, datatype, src, tag, comm, request); - V_OUTPUT_VERBOSE(50, "posted\tirecv_init %ld\tcomm %d\tfrom %d\ttag %d\tsize %ld", ((mca_pml_base_request_t *)*request)->req_sequence, comm->c_contextid, src, tag, (long) count); - return ret; -} - -int mca_vprotocol_example_irecv(void *addr, - size_t count, - ompi_datatype_t * datatype, - int src, - int tag, - struct ompi_communicator_t *comm, - struct ompi_request_t **request) -{ - int ret; - - ret = mca_pml_v.host_pml.pml_irecv(addr, count, datatype, src, tag, comm, request); - V_OUTPUT_VERBOSE(50, "posted\tirecv %ld\tcomm %d\tfrom %d\ttag %d\tsize %ld", ((mca_pml_base_request_t *)*request)->req_sequence, comm->c_contextid, src, tag, (long) count); - return ret; -} - -int mca_vprotocol_example_recv(void *addr, - size_t count, - ompi_datatype_t * datatype, - int src, - int tag, - struct ompi_communicator_t *comm, - ompi_status_public_t * status ) -{ - int ret; - V_OUTPUT_VERBOSE(50, "posted\trecv \tcomm %d\tfrom %d\ttag %d\tsize %ld", comm->c_contextid, src, tag, (long) count); - ret = mca_pml_v.host_pml.pml_recv(addr, count, datatype, src, tag, comm, status); -# if OPAL_ENABLE_DEBUG - if(status) - V_OUTPUT_VERBOSE(75, "deliver\trecv \tcomm %d\tfrom %d(%d)\ttag %d(%d)\tsize %ld(%ld)\tstatus %d", comm->c_contextid, src, status->MPI_SOURCE, tag, status->MPI_TAG, (long) count, (long) status->_ucount, status->MPI_ERROR); - else - V_OUTPUT_VERBOSE(75, "deliver\trecv \tcomm %d\tfrom %d\ttag %d\tsize %ld", comm->c_contextid, src, tag, (long) count); -# endif - return ret; -} diff --git a/ompi/mca/vprotocol/example/vprotocol_example_send.c b/ompi/mca/vprotocol/example/vprotocol_example_send.c deleted file mode 100644 index 28540e8bc58..00000000000 --- a/ompi/mca/vprotocol/example/vprotocol_example_send.c +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright (c) 2004-2007 The Trustees of the University of Tennessee. - * All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" -#include "../pml_v.h" -#include "vprotocol_example.h" - -int mca_vprotocol_example_isend_init(void *addr, - size_t count, - struct ompi_datatype_t *datatype, - int dst, - int tag, - mca_pml_base_send_mode_t sendmode, - struct ompi_communicator_t* comm, - struct ompi_request_t **request ) -{ - V_OUTPUT_VERBOSE(50, "request\tpisend \tcomm %d\tto %d\ttag %d\tsize %ld", comm->c_contextid, dst, tag, (long) count); - return mca_pml_v.host_pml.pml_isend_init(addr, count, datatype, dst, tag, sendmode, comm, request); -} - -int mca_vprotocol_example_isend(void *addr, - size_t count, - ompi_datatype_t * datatype, - int dst, - int tag, - mca_pml_base_send_mode_t sendmode, - struct ompi_communicator_t *comm, - struct ompi_request_t **request) -{ - V_OUTPUT_VERBOSE(50, "request\tisend \tcomm %d\tto %d\ttag %d\tsize %ld", comm->c_contextid, dst, tag, (long) count); - return mca_pml_v.host_pml.pml_isend(addr, count, datatype, dst, tag, sendmode, comm, request); -} - -int mca_vprotocol_example_send(void *addr, - size_t count, - ompi_datatype_t * datatype, - int dst, - int tag, - mca_pml_base_send_mode_t sendmode, - struct ompi_communicator_t *comm) -{ - V_OUTPUT_VERBOSE(50, "request\tsend \tcomm %d\tto %d\ttag %d\tsize %ld", comm->c_contextid, dst, tag, (long) count); - return mca_pml_v.host_pml.pml_send(addr, count, datatype, dst, tag, sendmode, comm); -} diff --git a/ompi/mca/vprotocol/example/vprotocol_example_start.c b/ompi/mca/vprotocol/example/vprotocol_example_start.c deleted file mode 100644 index e9a6d25d2d5..00000000000 --- a/ompi/mca/vprotocol/example/vprotocol_example_start.c +++ /dev/null @@ -1,19 +0,0 @@ -/* - * Copyright (c) 2004-2007 The Trustees of the University of Tennessee. - * All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" -#include "vprotocol_example.h" -#include "vprotocol_example_start.h" - -OMPI_DECLSPEC int mca_vprotocol_example_start(size_t count, ompi_request_t **requests) -{ - V_OUTPUT_VERBOSE(50, "starting %ld requests", (long) count); - return mca_pml_v.host_pml.pml_start(count, requests); -} diff --git a/ompi/mca/vprotocol/example/vprotocol_example_start.h b/ompi/mca/vprotocol/example/vprotocol_example_start.h deleted file mode 100644 index 91d56ee16db..00000000000 --- a/ompi/mca/vprotocol/example/vprotocol_example_start.h +++ /dev/null @@ -1,19 +0,0 @@ -/* - * Copyright (c) 2004-2007 The Trustees of the University of Tennessee. - * All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef __VPROTOCOL_EXAMPLE_START_H__ -#define __VPROTOCOL_EXAMPLE_START_H__ - -#include "ompi_config.h" -#include "vprotocol_example.h" - -#endif /* __VPROTOCOL_EXAMPLE_START_H__ */ - -OMPI_DECLSPEC int mca_vprotocol_example_start(size_t count, ompi_request_t **requests); diff --git a/ompi/mca/vprotocol/example/vprotocol_example_wait.c b/ompi/mca/vprotocol/example/vprotocol_example_wait.c deleted file mode 100644 index a74ce7dfeaf..00000000000 --- a/ompi/mca/vprotocol/example/vprotocol_example_wait.c +++ /dev/null @@ -1,31 +0,0 @@ -/* - * Copyright (c) 2004-2007 The Trustees of the University of Tennessee. - * All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" -#include "vprotocol_example.h" -#include "vprotocol_example_wait.h" - - -int mca_vprotocol_example_wait_any(size_t count, ompi_request_t ** requests, int *index, ompi_status_public_t * status) -{ - return OMPI_ERROR; -} - - -int mca_vprotocol_example_wait_some(size_t count, ompi_request_t ** requests, int *indexes, ompi_status_public_t * statuses) -{ - return mca_vprotocol_example_wait_any(count, requests, indexes, statuses); -} - - -int mca_vprotocol_example_wait_all(size_t count, ompi_request_t ** requests, ompi_status_public_t * statuses) -{ - return OMPI_ERROR; -} diff --git a/ompi/mca/vprotocol/example/vprotocol_example_wait.h b/ompi/mca/vprotocol/example/vprotocol_example_wait.h deleted file mode 100644 index 1c20047eafd..00000000000 --- a/ompi/mca/vprotocol/example/vprotocol_example_wait.h +++ /dev/null @@ -1,21 +0,0 @@ -/* - * Copyright (c) 2004-2007 The Trustees of the University of Tennessee. - * All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef __VPROTOCOL_EXAMPLE_WAIT_H__ -#define __VPROTOCOL_EXAMPLE_WAIT_H__ - -#include "ompi_config.h" -#include "vprotocol_example.h" - -OMPI_DECLSPEC int mca_vprotocol_example_wait_any(size_t count, ompi_request_t ** requests, int *index, ompi_status_public_t * status); -OMPI_DECLSPEC int mca_vprotocol_example_wait_some(size_t count, ompi_request_t ** requests, int *indexes, ompi_status_public_t * statuses); -OMPI_DECLSPEC int mca_vprotocol_example_wait_all(size_t count, ompi_request_t ** requests, ompi_status_public_t * statuses); - -#endif /* __VPROTOCOL_EXAMPLE_WAIT_H__ */ From 04360ccfd26f0f1be09d98912924be263d7e977c Mon Sep 17 00:00:00 2001 From: Jeff Squyres Date: Thu, 27 Nov 2025 10:03:57 -0500 Subject: [PATCH 27/51] coll/demo: Updates to support const, bigcount Signed-off-by: Jeff Squyres --- ompi/mca/coll/demo/coll_demo.h | 33 ++++++++++------------- ompi/mca/coll/demo/coll_demo_allgatherv.c | 4 +-- ompi/mca/coll/demo/coll_demo_component.c | 2 +- ompi/mca/coll/demo/coll_demo_gatherv.c | 4 +-- ompi/mca/coll/demo/coll_demo_scan.c | 2 +- ompi/mca/coll/demo/coll_demo_scatterv.c | 4 +-- 6 files changed, 22 insertions(+), 27 deletions(-) diff --git a/ompi/mca/coll/demo/coll_demo.h b/ompi/mca/coll/demo/coll_demo.h index 925c41d5e82..ab39200407b 100644 --- a/ompi/mca/coll/demo/coll_demo.h +++ b/ompi/mca/coll/demo/coll_demo.h @@ -64,7 +64,7 @@ BEGIN_C_DECLS struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module); - int mca_coll_demo_allgatherv_inter(const void *sbuf, int scount, + int mca_coll_demo_allgatherv_inter(const void *sbuf, size_t scount, struct ompi_datatype_t *sdtype, void * rbuf, ompi_count_array_t rcounts, ompi_disp_array_t disps, struct ompi_datatype_t *rdtype, @@ -109,15 +109,15 @@ BEGIN_C_DECLS mca_coll_base_module_t *module); int mca_coll_demo_alltoallw_intra(const void *sbuf, ompi_count_array_t scounts, ompi_disp_array_t sdisps, - struct ompi_datatype_t **sdtypes, + struct ompi_datatype_t *const *sdtypes, void *rbuf, ompi_count_array_t rcounts, ompi_disp_array_t rdisps, - struct ompi_datatype_t **rdtypes, + struct ompi_datatype_t *const *rdtypes, struct ompi_communicator_t *comm, mca_coll_base_module_t *module); - int mca_coll_demo_alltoallw_inter(void *sbuf, ompi_count_array_t scounts, ompi_disp_array_t sdisps, - struct ompi_datatype_t **sdtypes, + int mca_coll_demo_alltoallw_inter(const void *sbuf, ompi_count_array_t 
scounts, ompi_disp_array_t sdisps, + struct ompi_datatype_t *const *sdtypes, void *rbuf, ompi_count_array_t rcounts, ompi_disp_array_t rdisps, - struct ompi_datatype_t **rdtypes, + struct ompi_datatype_t *const *rdtypes, struct ompi_communicator_t *comm, mca_coll_base_module_t *module); @@ -159,15 +159,15 @@ BEGIN_C_DECLS int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module); - int mca_coll_demo_gatherv_intra(const void *sbuf, int scount, + int mca_coll_demo_gatherv_intra(const void *sbuf, size_t scount, struct ompi_datatype_t *sdtype, void *rbuf, ompi_count_array_t rcounts, ompi_disp_array_t disps, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module); - int mca_coll_demo_gatherv_inter(void *sbuf, int scount, - struct ompi_datatype_t *sdtype, void *rbuf, - ompi_count_array_t rcounts, ompi_disp_array_t disps, + int mca_coll_demo_gatherv_inter(const void *sbuf, size_t scount, + struct ompi_datatype_t *sdtype, + void *rbuf, ompi_count_array_t rcounts, ompi_disp_array_t disps, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module); @@ -199,15 +199,10 @@ BEGIN_C_DECLS mca_coll_base_module_t *module); int mca_coll_demo_scan_intra(const void *sbuf, void *rbuf, size_t count, - struct ompi_datatype_t *dtype, - struct ompi_op_t *op, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module); - int mca_coll_demo_scan_inter(const void *sbuf, void *rbuf, size_t count, - struct ompi_datatype_t *dtype, - struct ompi_op_t *op, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module); + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module); int mca_coll_demo_scatter_intra(const void *sbuf, size_t scount, struct ompi_datatype_t *sdtype, void *rbuf, diff --git a/ompi/mca/coll/demo/coll_demo_allgatherv.c b/ompi/mca/coll/demo/coll_demo_allgatherv.c index 283ccc6522a..390e1bbc14e 100644 --- a/ompi/mca/coll/demo/coll_demo_allgatherv.c +++ b/ompi/mca/coll/demo/coll_demo_allgatherv.c @@ -34,7 +34,7 @@ * Accepts: - same as MPI_Allgatherv() * Returns: - MPI_SUCCESS or error code */ -int mca_coll_demo_allgatherv_intra(const void *sbuf, int scount, +int mca_coll_demo_allgatherv_intra(const void *sbuf, size_t scount, struct ompi_datatype_t *sdtype, void * rbuf, ompi_count_array_t rcounts, ompi_disp_array_t disps, struct ompi_datatype_t *rdtype, @@ -57,7 +57,7 @@ int mca_coll_demo_allgatherv_intra(const void *sbuf, int scount, * Accepts: - same as MPI_Allgatherv() * Returns: - MPI_SUCCESS or error code */ -int mca_coll_demo_allgatherv_inter(const void *sbuf, int scount, +int mca_coll_demo_allgatherv_inter(const void *sbuf, size_t scount, struct ompi_datatype_t *sdtype, void * rbuf, ompi_count_array_t rcounts, ompi_disp_array_t disps, struct ompi_datatype_t *rdtype, diff --git a/ompi/mca/coll/demo/coll_demo_component.c b/ompi/mca/coll/demo/coll_demo_component.c index b7c96cdcae4..3ff6ce5d891 100644 --- a/ompi/mca/coll/demo/coll_demo_component.c +++ b/ompi/mca/coll/demo/coll_demo_component.c @@ -62,7 +62,7 @@ const mca_coll_base_component_3_0_0_t mca_coll_demo_component = { about the component itself */ .collm_version = { - MCA_COLL_BASE_VERSION_2_4_0, + MCA_COLL_BASE_VERSION_3_0_0, /* Component name and version */ .mca_component_name = "demo", diff --git a/ompi/mca/coll/demo/coll_demo_gatherv.c b/ompi/mca/coll/demo/coll_demo_gatherv.c index cc34d01b15b..8de0f8c7f5d 100644 --- 
a/ompi/mca/coll/demo/coll_demo_gatherv.c +++ b/ompi/mca/coll/demo/coll_demo_gatherv.c @@ -34,7 +34,7 @@ * Accepts: - same arguments as MPI_Gatherv() * Returns: - MPI_SUCCESS or error code */ -int mca_coll_demo_gatherv_intra(const void *sbuf, int scount, +int mca_coll_demo_gatherv_intra(const void *sbuf, size_t scount, struct ompi_datatype_t *sdtype, void *rbuf, ompi_count_array_t rcounts, ompi_disp_array_t disps, struct ompi_datatype_t *rdtype, int root, @@ -57,7 +57,7 @@ int mca_coll_demo_gatherv_intra(const void *sbuf, int scount, * Accepts: - same arguments as MPI_Gatherv() * Returns: - MPI_SUCCESS or error code */ -int mca_coll_demo_gatherv_inter(const void *sbuf, int scount, +int mca_coll_demo_gatherv_inter(const void *sbuf, size_t scount, struct ompi_datatype_t *sdtype, void *rbuf, ompi_count_array_t rcounts, ompi_disp_array_t disps, struct ompi_datatype_t *rdtype, int root, diff --git a/ompi/mca/coll/demo/coll_demo_scan.c b/ompi/mca/coll/demo/coll_demo_scan.c index 030b8827ced..dd6ee88d68f 100644 --- a/ompi/mca/coll/demo/coll_demo_scan.c +++ b/ompi/mca/coll/demo/coll_demo_scan.c @@ -34,7 +34,7 @@ * Accepts: - same arguments as MPI_Scan() * Returns: - MPI_SUCCESS or error code */ -int mca_coll_demo_scan_intra(void *sbuf, void *rbuf, size_t count, +int mca_coll_demo_scan_intra(const void *sbuf, void *rbuf, size_t count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, diff --git a/ompi/mca/coll/demo/coll_demo_scatterv.c b/ompi/mca/coll/demo/coll_demo_scatterv.c index 11e42905df5..5aceff7e779 100644 --- a/ompi/mca/coll/demo/coll_demo_scatterv.c +++ b/ompi/mca/coll/demo/coll_demo_scatterv.c @@ -36,7 +36,7 @@ */ int mca_coll_demo_scatterv_intra(const void *sbuf, ompi_count_array_t scounts, ompi_disp_array_t disps, struct ompi_datatype_t *sdtype, - void *rbuf, int rcount, + void *rbuf, size_t rcount, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) @@ -59,7 +59,7 @@ int mca_coll_demo_scatterv_intra(const void *sbuf, ompi_count_array_t scounts, */ int mca_coll_demo_scatterv_inter(const void *sbuf, ompi_count_array_t scounts, ompi_disp_array_t disps, struct ompi_datatype_t *sdtype, - void *rbuf, int rcount, + void *rbuf, size_t rcount, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) From 4dea0fea8852157d72864d21a1c14d0f5ac13e62 Mon Sep 17 00:00:00 2001 From: Jeff Squyres Date: Thu, 27 Nov 2025 10:04:44 -0500 Subject: [PATCH 28/51] topo/example: updates for const and API changes Signed-off-by: Jeff Squyres --- ompi/mca/topo/example/topo_example.h | 9 +++++---- ompi/mca/topo/example/topo_example_cart_map.c | 6 +++--- ompi/mca/topo/example/topo_example_component.c | 9 +++------ ompi/mca/topo/example/topo_example_graph_map.c | 4 ++-- 4 files changed, 13 insertions(+), 15 deletions(-) diff --git a/ompi/mca/topo/example/topo_example.h b/ompi/mca/topo/example/topo_example.h index 8e2b2903d7d..cf810602605 100644 --- a/ompi/mca/topo/example/topo_example.h +++ b/ompi/mca/topo/example/topo_example.h @@ -66,14 +66,15 @@ OBJ_CLASS_DECLARATION(mca_topo_example_module_t); int mca_topo_example_cart_map(struct ompi_communicator_t *comm, int ndims, - int *dims, - int *periods, + const int *dims, + const int *periods, int *newrank); + int mca_topo_example_graph_map(struct ompi_communicator_t *comm, int nnodes, - int *index, - int *edges, + const int *index, + const int *edges, int *newrank); /* * 
****************************************************************** diff --git a/ompi/mca/topo/example/topo_example_cart_map.c b/ompi/mca/topo/example/topo_example_cart_map.c index 4c78a75348b..05eaf4e4215 100644 --- a/ompi/mca/topo/example/topo_example_cart_map.c +++ b/ompi/mca/topo/example/topo_example_cart_map.c @@ -41,15 +41,15 @@ int mca_topo_example_cart_map (ompi_communicator_t* comm, int ndims, - int *dims, - int *periods, + const int *dims, + const int *periods, int *newrank) { int nprocs; int rank; int size; int i; - int *p; + const int *p; /* * Compute the # of processes in the grid. diff --git a/ompi/mca/topo/example/topo_example_component.c b/ompi/mca/topo/example/topo_example_component.c index 07d222dd6a4..ce51fa78f1f 100644 --- a/ompi/mca/topo/example/topo_example_component.c +++ b/ompi/mca/topo/example/topo_example_component.c @@ -34,8 +34,7 @@ const char *mca_topo_example_component_version_string = * Local functions */ static int init_query(bool enable_progress_threads, bool enable_mpi_threads); -static struct mca_topo_base_module_t * -comm_query(const ompi_communicator_t *comm, int *priority, uint32_t type); +static struct mca_topo_base_module_t *query(const ompi_communicator_t *comm, const ompi_group_t *group, int *priority, uint32_t type); /* * Public component structure @@ -57,7 +56,7 @@ mca_topo_base_component_2_2_0_t mca_topo_example_component = }, .topoc_init_query = init_query, - .topoc_comm_query = comm_query, + .topoc_query = query, }; MCA_BASE_COMPONENT_INIT(ompi, topo, example) @@ -71,7 +70,7 @@ static int init_query(bool enable_progress_threads, bool enable_mpi_threads) static struct mca_topo_base_module_t * -comm_query(const ompi_communicator_t *comm, int *priority, uint32_t type) +query(const ompi_communicator_t *comm, const ompi_group_t *group, int *priority, uint32_t type) { mca_topo_example_module_t *example = OBJ_NEW(mca_topo_example_module_t); if (NULL == example) { @@ -88,5 +87,3 @@ comm_query(const ompi_communicator_t *comm, int *priority, uint32_t type) example->super.type = type; return &(example->super); } - - diff --git a/ompi/mca/topo/example/topo_example_graph_map.c b/ompi/mca/topo/example/topo_example_graph_map.c index e2fa095babd..08341bf471c 100644 --- a/ompi/mca/topo/example/topo_example_graph_map.c +++ b/ompi/mca/topo/example/topo_example_graph_map.c @@ -40,8 +40,8 @@ int mca_topo_example_graph_map (ompi_communicator_t* comm, int nnodes, - int *index, - int *edges, + const int *index, + const int *edges, int *newrank) { int myrank; From f2f5d1436188d79d6b7ea80185a168e1990f1570 Mon Sep 17 00:00:00 2001 From: Jeff Squyres Date: Sat, 29 Nov 2025 06:56:34 -0500 Subject: [PATCH 29/51] Github Actions: Add Action to build ignored components Signed-off-by: Jeff Squyres --- .github/workflows/compile-examples.yaml | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 .github/workflows/compile-examples.yaml diff --git a/.github/workflows/compile-examples.yaml b/.github/workflows/compile-examples.yaml new file mode 100644 index 00000000000..d03421c5530 --- /dev/null +++ b/.github/workflows/compile-examples.yaml @@ -0,0 +1,19 @@ +name: Compile ignored components + +on: [pull_request] + +jobs: + compile-ignored: + runs-on: ubuntu-24.04 + steps: + - uses: actions/checkout@v4 + with: + submodules: recursive + - name: Remove .opal_ignore files so that we build all examples + run: | + find . 
-name .opal_ignore -exec rm -f {} \; -print
+      - name: Build Open MPI
+        run: |
+          ./autogen.pl
+          ./configure --prefix=${PWD}/install --disable-silent-rules
+          make -j

From 8e3823a3134e9daaf91116217a789d7d8f2a6c1d Mon Sep 17 00:00:00 2001
From: Howard Pritchard
Date: Mon, 1 Dec 2025 11:30:33 -0700
Subject: [PATCH 30/51] fortran i8: squash some more compiler warnings

that show up when trying to build fortran bindings with integer(kind=8)
default.

Signed-off-by: Howard Pritchard
---
 ompi/mpi/fortran/mpif-h/get_elements_x_f.c     | 1 -
 ompi/mpi/fortran/mpif-h/remove_error_class_f.c | 5 -----
 ompi/mpi/fortran/mpif-h/remove_error_code_f.c  | 5 -----
 ompi/mpi/fortran/mpif-h/type_size_x_f.c        | 1 -
 4 files changed, 12 deletions(-)

diff --git a/ompi/mpi/fortran/mpif-h/get_elements_x_f.c b/ompi/mpi/fortran/mpif-h/get_elements_x_f.c
index 10480b6c7a4..044b2f200d5 100644
--- a/ompi/mpi/fortran/mpif-h/get_elements_x_f.c
+++ b/ompi/mpi/fortran/mpif-h/get_elements_x_f.c
@@ -74,7 +74,6 @@ void ompi_get_elements_x_f(MPI_Fint *status, MPI_Fint *datatype, MPI_Count *coun
     int c_ierr;
     MPI_Datatype c_type = PMPI_Type_f2c(*datatype);
     MPI_Status c_status;
-    OMPI_SINGLE_NAME_DECL(count);

     if (OMPI_IS_FORTRAN_STATUS_IGNORE(status)) {
         *count = OMPI_INT_2_FINT(0);
diff --git a/ompi/mpi/fortran/mpif-h/remove_error_class_f.c b/ompi/mpi/fortran/mpif-h/remove_error_class_f.c
index 478ddd16504..2eeb4ad2d09 100644
--- a/ompi/mpi/fortran/mpif-h/remove_error_class_f.c
+++ b/ompi/mpi/fortran/mpif-h/remove_error_class_f.c
@@ -70,12 +70,7 @@ OMPI_GENERATE_F77_BINDINGS (MPI_REMOVE_ERROR_CLASS,
 void ompi_remove_error_class_f(MPI_Fint *errorclass, MPI_Fint *ierr)
 {
     int ierr_c;
-    OMPI_SINGLE_NAME_DECL(errorclass);

     ierr_c = PMPI_Remove_error_class(OMPI_FINT_2_INT(*errorclass));
     if (NULL != ierr) *ierr = OMPI_INT_2_FINT(ierr_c);
-
-    if (MPI_SUCCESS == ierr_c) {
-        OMPI_SINGLE_INT_2_FINT(errorclass);
-    }
 }
diff --git a/ompi/mpi/fortran/mpif-h/remove_error_code_f.c b/ompi/mpi/fortran/mpif-h/remove_error_code_f.c
index 1cd6536e461..f9889cb6d64 100644
--- a/ompi/mpi/fortran/mpif-h/remove_error_code_f.c
+++ b/ompi/mpi/fortran/mpif-h/remove_error_code_f.c
@@ -70,12 +70,7 @@ OMPI_GENERATE_F77_BINDINGS (MPI_REMOVE_ERROR_CODE,
 void ompi_remove_error_code_f(MPI_Fint *errorcode, MPI_Fint *ierr)
 {
     int ierr_c;
-    OMPI_SINGLE_NAME_DECL(errorcode);

     ierr_c = PMPI_Remove_error_code(OMPI_FINT_2_INT(*errorcode));
-
     if (NULL != ierr) *ierr = OMPI_INT_2_FINT(ierr_c);
-
-    if (MPI_SUCCESS == ierr_c) {
-        OMPI_SINGLE_INT_2_FINT(errorcode);
-    }
 }
diff --git a/ompi/mpi/fortran/mpif-h/type_size_x_f.c b/ompi/mpi/fortran/mpif-h/type_size_x_f.c
index 2527f49878f..eff61245d16 100644
--- a/ompi/mpi/fortran/mpif-h/type_size_x_f.c
+++ b/ompi/mpi/fortran/mpif-h/type_size_x_f.c
@@ -72,7 +72,6 @@ void ompi_type_size_x_f(MPI_Fint *type, MPI_Count *size, MPI_Fint *ierr)
 {
     int c_ierr;
     MPI_Datatype c_type = PMPI_Type_f2c(*type);
-    OMPI_SINGLE_NAME_DECL(size);

     c_ierr = PMPI_Type_size_x(c_type, size);
     if (NULL != ierr) *ierr = OMPI_INT_2_FINT(c_ierr);

From 64e86bd30c341ee8c2aa9505bc9a7c22eeafdbff Mon Sep 17 00:00:00 2001
From: Howard Pritchard
Date: Mon, 1 Dec 2025 11:34:38 -0700
Subject: [PATCH 31/51] configure: update ubcl with newer help macro

With a recent autoconf toolchain, autogen.pl emits a mess of warnings
when processing the ompi_check_ubcl.m4 file:

autoreconf: running: /opt/homebrew/Cellar/autoconf/2.71/bin/autoconf --include=config --include=config/oac --force
configure.ac:1209: warning: The macro `AC_HELP_STRING' is obsolete.
configure.ac:1209: You should run autoupdate.
./lib/autoconf/general.m4:204: AC_HELP_STRING is expanded from... ./lib/autoconf/general.m4:1553: AC_ARG_WITH is expanded from... config/ompi_check_ubcl.m4:24: OMPI_CHECK_UBCL is expanded from... opal/mca/common/ubcl/configure.m4:11: MCA_opal_common_ubcl_CONFIG is expanded from... config/opal_mca.m4:597: MCA_CONFIGURE_M4_CONFIG_COMPONENT is expanded from... config/opal_mca.m4:376: MCA_CONFIGURE_FRAMEWORK is expanded from... config/opal_mca.m4:268: MCA_CONFIGURE_PROJECT is expanded from... config/opal_mca.m4:42: OPAL_MCA is expanded from... This patch fixes that. Signed-off-by: Howard Pritchard --- config/ompi_check_ubcl.m4 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/ompi_check_ubcl.m4 b/config/ompi_check_ubcl.m4 index d44a6e4b6cc..0957a4fe2ed 100644 --- a/config/ompi_check_ubcl.m4 +++ b/config/ompi_check_ubcl.m4 @@ -27,7 +27,7 @@ AC_DEFUN([OMPI_CHECK_UBCL],[ m4_ifblank([$1], [m4_fatal([First argument to OMPI_CHECK_UBCL cannot be blank])]) AC_ARG_WITH([ubcl], - [AC_HELP_STRING([--with-ubcl(=DIR)], + [AS_HELP_STRING([--with-ubcl(=DIR)], [Build with UBCL support])]) # UBCL is dlopen'd to avoid direct link to libubcl.so. From 3980e803c328c789bcbfbd891323de92967f596a Mon Sep 17 00:00:00 2001 From: Qiao Kang Date: Mon, 1 Dec 2025 12:47:32 -0600 Subject: [PATCH 32/51] coll/ucc: Fix indentation issue with tab. Signed-off-by: Qiao Kang --- ompi/mca/coll/ucc/coll_ucc_module.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ompi/mca/coll/ucc/coll_ucc_module.c b/ompi/mca/coll/ucc/coll_ucc_module.c index dfa3c1cf96c..028382df344 100644 --- a/ompi/mca/coll/ucc/coll_ucc_module.c +++ b/ompi/mca/coll/ucc/coll_ucc_module.c @@ -207,9 +207,9 @@ static ucc_status_t oob_allgather_test(void *req) tmpsend = (char*)oob_req->rbuf + (ptrdiff_t)senddatafrom * (ptrdiff_t)msglen; rc = MCA_PML_CALL(isend(tmpsend, msglen, MPI_BYTE, sendto, MCA_COLL_BASE_TAG_UCC, MCA_PML_BASE_SEND_STANDARD, comm, &oob_req->reqs[0])); - if (OMPI_SUCCESS != rc) { + if (OMPI_SUCCESS != rc) { return UCC_ERR_NO_MESSAGE; - } + } rc = MCA_PML_CALL(irecv(tmprecv, msglen, MPI_BYTE, recvfrom, MCA_COLL_BASE_TAG_UCC, comm, &oob_req->reqs[1])); if (OMPI_SUCCESS != rc) { From 9531590937292939a33acdb6262e0e0c118d0b91 Mon Sep 17 00:00:00 2001 From: Qiao Kang Date: Tue, 2 Dec 2025 09:35:35 -0600 Subject: [PATCH 33/51] coll/ucc: Fix indentation issue with tab. 
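This is the follow-on to the previous commit: patch 32 normalized the
error check after the isend in oob_allgather_test(), and this change
normalizes the matching check after the irecv a few lines below. After
both fixes the error path reads uniformly; the snippet below is the
post-fix shape from the diff, reflowed here only for readability:

rc = MCA_PML_CALL(irecv(tmprecv, msglen, MPI_BYTE, recvfrom,
                        MCA_COLL_BASE_TAG_UCC, comm, &oob_req->reqs[1]));
if (OMPI_SUCCESS != rc) {
    return UCC_ERR_NO_MESSAGE;
}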
Signed-off-by: Qiao Kang --- ompi/mca/coll/ucc/coll_ucc_module.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ompi/mca/coll/ucc/coll_ucc_module.c b/ompi/mca/coll/ucc/coll_ucc_module.c index 028382df344..aa1dee91c24 100644 --- a/ompi/mca/coll/ucc/coll_ucc_module.c +++ b/ompi/mca/coll/ucc/coll_ucc_module.c @@ -212,9 +212,9 @@ static ucc_status_t oob_allgather_test(void *req) } rc = MCA_PML_CALL(irecv(tmprecv, msglen, MPI_BYTE, recvfrom, MCA_COLL_BASE_TAG_UCC, comm, &oob_req->reqs[1])); - if (OMPI_SUCCESS != rc) { + if (OMPI_SUCCESS != rc) { return UCC_ERR_NO_MESSAGE; - } + } } probe = 0; do { From 6519bd042412c1c354dcb38ba61e9e29e0631216 Mon Sep 17 00:00:00 2001 From: Joseph Antony Date: Wed, 19 Nov 2025 12:29:14 -0500 Subject: [PATCH 34/51] Public APIs for: put_with_notify get_with_notify Signed-off-by: Joseph Antony --- ompi/include/mpi.h.in | 29 ++++++++ ompi/include/mpif-values.py | 1 + ompi/mca/osc/osc.h | 16 ++--- ompi/mca/osc/sm/osc_sm.h | 10 +-- ompi/mca/osc/sm/osc_sm_comm.c | 8 +-- ompi/mca/osc/sm/osc_sm_component.c | 8 +-- ompi/mpi/bindings/ompi_bindings/consts.py | 1 + ompi/mpi/c/Makefile.am | 2 + ompi/mpi/c/get_notify.c.in | 77 ++++++++++++++++++++++ ompi/mpi/c/put_notify.c.in | 80 +++++++++++++++++++++++ ompi/runtime/ompi_spc.c | 2 + ompi/runtime/ompi_spc.h | 2 + 12 files changed, 215 insertions(+), 21 deletions(-) create mode 100644 ompi/mpi/c/get_notify.c.in create mode 100644 ompi/mpi/c/put_notify.c.in diff --git a/ompi/include/mpi.h.in b/ompi/include/mpi.h.in index 5730d67ef8a..73542a9ed5c 100644 --- a/ompi/include/mpi.h.in +++ b/ompi/include/mpi.h.in @@ -764,6 +764,7 @@ enum { #define MPI_ERR_SESSION 78 #define MPI_ERR_VALUE_TOO_LARGE 79 #define MPI_ERR_ERRHANDLER 80 +#define MPI_ERR_NOTIFY_IDX 81 /* Per MPI-3 p349 47, MPI_ERR_LASTCODE must be >= the last predefined MPI_ERR_ code. 
Set the last code to allow some room for adding @@ -1917,6 +1918,14 @@ OMPI_DECLSPEC int MPI_Get_c(void *origin_addr, MPI_Count origin_count, MPI_Datatype origin_datatype, int target_rank, MPI_Aint target_disp, MPI_Count target_count, MPI_Datatype target_datatype, MPI_Win win); +OMPI_DECLSPEC int MPI_Get_notify(void *origin_addr, int origin_count, + MPI_Datatype origin_datatype, int target_rank, + MPI_Aint target_disp, int target_count, + MPI_Datatype target_datatype, int notification_idx, MPI_Win win); +OMPI_DECLSPEC int MPI_Get_notify_c(void *origin_addr, MPI_Count origin_count, + MPI_Datatype origin_datatype, int target_rank, + MPI_Aint target_disp, MPI_Count target_count, + MPI_Datatype target_datatype, int notification_idx, MPI_Win win); OMPI_DECLSPEC int MPI_Get_accumulate(const void *origin_addr, int origin_count, MPI_Datatype origin_datatype, void *result_addr, int result_count, MPI_Datatype result_datatype, int target_rank, MPI_Aint target_disp, int target_count, @@ -2180,6 +2189,12 @@ OMPI_DECLSPEC int MPI_Put(const void *origin_addr, int origin_count, MPI_Dataty OMPI_DECLSPEC int MPI_Put_c(const void *origin_addr, MPI_Count origin_count, MPI_Datatype origin_datatype, int target_rank, MPI_Aint target_disp, MPI_Count target_count, MPI_Datatype target_datatype, MPI_Win win); +OMPI_DECLSPEC int MPI_Put_notify(const void *origin_addr, int origin_count, MPI_Datatype origin_datatype, + int target_rank, MPI_Aint target_disp, int target_count, + MPI_Datatype target_datatype, int notification_idx, MPI_Win win); +OMPI_DECLSPEC int MPI_Put_notify_c(const void *origin_addr, MPI_Count origin_count, MPI_Datatype origin_datatype, + int target_rank, MPI_Aint target_disp, MPI_Count target_count, + MPI_Datatype target_datatype, int notification_idx, MPI_Win win); OMPI_DECLSPEC int MPI_Query_thread(int *provided); OMPI_DECLSPEC int MPI_Raccumulate(const void *origin_addr, int origin_count, MPI_Datatype origin_datatype, int target_rank, MPI_Aint target_disp, int target_count, @@ -3093,6 +3108,14 @@ OMPI_DECLSPEC int PMPI_Get_c(void *origin_addr, MPI_Count origin_count, MPI_Datatype origin_datatype, int target_rank, MPI_Aint target_disp, MPI_Count target_count, MPI_Datatype target_datatype, MPI_Win win); +OMPI_DECLSPEC int PMPI_Get_notify(void *origin_addr, int origin_count, + MPI_Datatype origin_datatype, int target_rank, + MPI_Aint target_disp, int target_count, + MPI_Datatype target_datatype, int notification_idx, MPI_Win win); +OMPI_DECLSPEC int PMPI_Get_notify_c(void *origin_addr, MPI_Count origin_count, + MPI_Datatype origin_datatype, int target_rank, + MPI_Aint target_disp, MPI_Count target_count, + MPI_Datatype target_datatype, int notification_idx, MPI_Win win); OMPI_DECLSPEC int PMPI_Get_accumulate(const void *origin_addr, int origin_count, MPI_Datatype origin_datatype, void *result_addr, int result_count, MPI_Datatype result_datatype, int target_rank, MPI_Aint target_disp, int target_count, @@ -3356,6 +3379,12 @@ OMPI_DECLSPEC int PMPI_Put(const void *origin_addr, int origin_count, MPI_Datat OMPI_DECLSPEC int PMPI_Put_c(const void *origin_addr, MPI_Count origin_count, MPI_Datatype origin_datatype, int target_rank, MPI_Aint target_disp, MPI_Count target_count, MPI_Datatype target_datatype, MPI_Win win); +OMPI_DECLSPEC int PMPI_Put_notify(const void *origin_addr, int origin_count, MPI_Datatype origin_datatype, + int target_rank, MPI_Aint target_disp, int target_count, + MPI_Datatype target_datatype, int notification_idx, MPI_Win win); +OMPI_DECLSPEC int PMPI_Put_notify_c(const void 
*origin_addr, MPI_Count origin_count, MPI_Datatype origin_datatype, + int target_rank, MPI_Aint target_disp, MPI_Count target_count, + MPI_Datatype target_datatype, int notification_idx, MPI_Win win); OMPI_DECLSPEC int PMPI_Query_thread(int *provided); OMPI_DECLSPEC int PMPI_Raccumulate(const void *origin_addr, int origin_count, MPI_Datatype origin_datatype, int target_rank, MPI_Aint target_disp, int target_count, diff --git a/ompi/include/mpif-values.py b/ompi/include/mpif-values.py index 53159d5d8dd..b74fbcbaf1f 100755 --- a/ompi/include/mpif-values.py +++ b/ompi/include/mpif-values.py @@ -301,6 +301,7 @@ 'MPI_ERR_SESSION': 78, 'MPI_ERR_VALUE_TOO_LARGE': 79, 'MPI_ERR_ERRHANDLER': 80, + 'MPI_ERR_NOTIFY_IDX': 81, 'MPI_ERR_LASTCODE': 92, 'MPI_IDENT': 0, 'MPI_CONGRUENT': 1, diff --git a/ompi/mca/osc/osc.h b/ompi/mca/osc/osc.h index bd05a6f11b7..83c7af9305e 100644 --- a/ompi/mca/osc/osc.h +++ b/ompi/mca/osc/osc.h @@ -216,7 +216,7 @@ typedef int (*ompi_osc_base_module_put_fn_t)(const void *origin_addr, struct ompi_datatype_t *target_dt, struct ompi_win_t *win); -typedef int (*ompi_osc_base_module_put_with_notify_fn_t)(const void *origin_addr, +typedef int (*ompi_osc_base_module_put_notify_fn_t)(const void *origin_addr, size_t origin_count, struct ompi_datatype_t *origin_dt, int target, @@ -235,7 +235,7 @@ typedef int (*ompi_osc_base_module_get_fn_t)(void *origin_addr, struct ompi_datatype_t *target_dt, struct ompi_win_t *win); -typedef int (*ompi_osc_base_module_get_with_notify_fn_t)(void *origin_addr, +typedef int (*ompi_osc_base_module_get_notify_fn_t)(void *origin_addr, size_t origin_count, struct ompi_datatype_t *origin_dt, int target, @@ -294,7 +294,7 @@ typedef int (*ompi_osc_base_module_rput_fn_t)(const void *origin_addr, struct ompi_win_t *win, struct ompi_request_t **request); -typedef int (*ompi_osc_base_module_rput_with_notify_fn_t)(const void *origin_addr, +typedef int (*ompi_osc_base_module_rput_notify_fn_t)(const void *origin_addr, size_t origin_count, struct ompi_datatype_t *origin_dt, int target, @@ -315,7 +315,7 @@ typedef int (*ompi_osc_base_module_rget_fn_t)(void *origin_addr, struct ompi_win_t *win, struct ompi_request_t **request); -typedef int (*ompi_osc_base_module_rget_with_notify_fn_t)(void *origin_addr, +typedef int (*ompi_osc_base_module_rget_notify_fn_t)(void *origin_addr, size_t origin_count, struct ompi_datatype_t *origin_dt, int target, @@ -418,18 +418,18 @@ struct ompi_osc_base_module_4_0_0_t { ompi_osc_base_module_free_fn_t osc_free; ompi_osc_base_module_put_fn_t osc_put; - ompi_osc_base_module_put_with_notify_fn_t osc_put_with_notify; + ompi_osc_base_module_put_notify_fn_t osc_put_notify; ompi_osc_base_module_get_fn_t osc_get; - ompi_osc_base_module_get_with_notify_fn_t osc_get_with_notify; + ompi_osc_base_module_get_notify_fn_t osc_get_notify; ompi_osc_base_module_accumulate_fn_t osc_accumulate; ompi_osc_base_module_compare_and_swap_fn_t osc_compare_and_swap; ompi_osc_base_module_fetch_and_op_fn_t osc_fetch_and_op; ompi_osc_base_module_get_accumulate_fn_t osc_get_accumulate; ompi_osc_base_module_rput_fn_t osc_rput; - ompi_osc_base_module_rput_with_notify_fn_t osc_rput_with_notify; + ompi_osc_base_module_rput_notify_fn_t osc_rput_notify; ompi_osc_base_module_rget_fn_t osc_rget; - ompi_osc_base_module_rget_with_notify_fn_t osc_rget_with_notify; + ompi_osc_base_module_rget_notify_fn_t osc_rget_notify; ompi_osc_base_module_raccumulate_fn_t osc_raccumulate; ompi_osc_base_module_rget_accumulate_fn_t osc_rget_accumulate; diff --git a/ompi/mca/osc/sm/osc_sm.h 
b/ompi/mca/osc/sm/osc_sm.h index b7d6dadfd49..200ec8b3de8 100644 --- a/ompi/mca/osc/sm/osc_sm.h +++ b/ompi/mca/osc/sm/osc_sm.h @@ -107,7 +107,7 @@ int ompi_osc_sm_detach(struct ompi_win_t *win, const void *base); int ompi_osc_sm_free(struct ompi_win_t *win); -// TODO: add put/get_with_notify prototypes +// TODO: add put/get_notify prototypes int ompi_osc_sm_put(const void *origin_addr, size_t origin_count, @@ -118,7 +118,7 @@ int ompi_osc_sm_put(const void *origin_addr, struct ompi_datatype_t *target_dt, struct ompi_win_t *win); - int ompi_osc_sm_put_with_notify(const void *origin_addr, + int ompi_osc_sm_put_notify(const void *origin_addr, size_t origin_count, struct ompi_datatype_t *origin_dt, int target, @@ -137,7 +137,7 @@ int ompi_osc_sm_get(void *origin_addr, struct ompi_datatype_t *target_dt, struct ompi_win_t *win); -int ompi_osc_sm_get_with_notify(void *origin_addr, +int ompi_osc_sm_get_notify(void *origin_addr, size_t origin_count, struct ompi_datatype_t *origin_dt, int target, @@ -196,7 +196,7 @@ int ompi_osc_sm_rput(const void *origin_addr, struct ompi_win_t *win, struct ompi_request_t **request); -int ompi_osc_sm_rput_with_notify(const void *origin_addr, +int ompi_osc_sm_rput_notify(const void *origin_addr, size_t origin_count, struct ompi_datatype_t *origin_dt, int target, @@ -217,7 +217,7 @@ int ompi_osc_sm_rget(void *origin_addr, struct ompi_win_t *win, struct ompi_request_t **request); -int ompi_osc_sm_rget_with_notify(void *origin_addr, +int ompi_osc_sm_rget_notify(void *origin_addr, size_t origin_count, struct ompi_datatype_t *origin_dt, int target, diff --git a/ompi/mca/osc/sm/osc_sm_comm.c b/ompi/mca/osc/sm/osc_sm_comm.c index ba19d8c08cf..4391a375ebc 100644 --- a/ompi/mca/osc/sm/osc_sm_comm.c +++ b/ompi/mca/osc/sm/osc_sm_comm.c @@ -60,7 +60,7 @@ ompi_osc_sm_rput(const void *origin_addr, } int -ompi_osc_sm_rput_with_notify(const void *origin_addr, +ompi_osc_sm_rput_notify(const void *origin_addr, size_t origin_count, struct ompi_datatype_t *origin_dt, int target, @@ -143,7 +143,7 @@ ompi_osc_sm_rget(void *origin_addr, } int -ompi_osc_sm_rget_with_notify(void *origin_addr, +ompi_osc_sm_rget_notify(void *origin_addr, size_t origin_count, struct ompi_datatype_t *origin_dt, int target, @@ -323,7 +323,7 @@ ompi_osc_sm_put(const void *origin_addr, int -ompi_osc_sm_put_with_notify(const void *origin_addr, +ompi_osc_sm_put_notify(const void *origin_addr, size_t origin_count, struct ompi_datatype_t *origin_dt, int target, @@ -392,7 +392,7 @@ ompi_osc_sm_get(void *origin_addr, int -ompi_osc_sm_get_with_notify(void *origin_addr, +ompi_osc_sm_get_notify(void *origin_addr, size_t origin_count, struct ompi_datatype_t *origin_dt, int target, diff --git a/ompi/mca/osc/sm/osc_sm_component.c b/ompi/mca/osc/sm/osc_sm_component.c index 11f0ccc2e47..e7613c86f6e 100644 --- a/ompi/mca/osc/sm/osc_sm_component.c +++ b/ompi/mca/osc/sm/osc_sm_component.c @@ -79,18 +79,18 @@ ompi_osc_sm_module_t ompi_osc_sm_module_template = { .osc_free = ompi_osc_sm_free, .osc_put = ompi_osc_sm_put, - .osc_put_with_notify = ompi_osc_sm_put_with_notify, + .osc_put_notify = ompi_osc_sm_put_notify, .osc_get = ompi_osc_sm_get, - .osc_get_with_notify = ompi_osc_sm_get_with_notify, + .osc_get_notify = ompi_osc_sm_get_notify, .osc_accumulate = ompi_osc_sm_accumulate, .osc_compare_and_swap = ompi_osc_sm_compare_and_swap, .osc_fetch_and_op = ompi_osc_sm_fetch_and_op, .osc_get_accumulate = ompi_osc_sm_get_accumulate, .osc_rput = ompi_osc_sm_rput, - .osc_rput_with_notify = ompi_osc_sm_rput_with_notify, + .osc_rput_notify = 
ompi_osc_sm_rput_notify,
     .osc_rget = ompi_osc_sm_rget,
-    .osc_rget_with_notify = ompi_osc_sm_rget_with_notify,
+    .osc_rget_notify = ompi_osc_sm_rget_notify,
     .osc_raccumulate = ompi_osc_sm_raccumulate,
     .osc_rget_accumulate = ompi_osc_sm_rget_accumulate,

diff --git a/ompi/mpi/bindings/ompi_bindings/consts.py b/ompi/mpi/bindings/ompi_bindings/consts.py
index 43bca486b57..759b342f64a 100644
--- a/ompi/mpi/bindings/ompi_bindings/consts.py
+++ b/ompi/mpi/bindings/ompi_bindings/consts.py
@@ -23,6 +23,7 @@
     'MPI_SUCCESS',
     'MPI_ERR_BUFFER',
     'MPI_ERR_COUNT',
+    'MPI_ERR_NOTIFY_IDX',
     'MPI_ERR_TYPE',
     'MPI_ERR_TAG',
     'MPI_ERR_COMM',
diff --git a/ompi/mpi/c/Makefile.am b/ompi/mpi/c/Makefile.am
index 5dbd959cd44..0d83fa055b1 100644
--- a/ompi/mpi/c/Makefile.am
+++ b/ompi/mpi/c/Makefile.am
@@ -220,6 +220,7 @@ prototype_sources = \
         get_accumulate.c.in \
         get_address.c.in \
         get.c.in \
+        get_notify.c.in \
         get_count.c.in \
         get_elements.c.in \
         get_elements_x.c.in \
@@ -338,6 +339,7 @@ prototype_sources = \
         psend_init.c.in \
         publish_name.c.in \
         put.c.in \
+        put_notify.c.in \
         query_thread.c.in \
         raccumulate.c.in \
         recv.c.in \
diff --git a/ompi/mpi/c/get_notify.c.in b/ompi/mpi/c/get_notify.c.in
new file mode 100644
index 00000000000..1bad16944ab
--- /dev/null
+++ b/ompi/mpi/c/get_notify.c.in
@@ -0,0 +1,77 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
+/*
+ * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation.  All rights reserved.
+ * Copyright (c) 2004-2020 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+ * Copyright (c) 2004-2008 High Performance Computing Center Stuttgart,
+ *                         University of Stuttgart.  All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ * Copyright (c) 2015      Los Alamos National Security, LLC. All rights
+ *                         reserved.
+ * Copyright (c) 2015      Research Organization for Information Science
+ *                         and Technology (RIST). All rights reserved.
+ * Copyright (c) 2024      Triad National Security, LLC. All rights
+ *                         reserved.
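Before the template body below, a caller-side sketch of the new binding.
Everything except the MPI_Get_notify signature itself is an assumption
made for illustration (the window, the peer rank, and the choice of
MPI_ERRORS_RETURN); what the target does with notification_idx is left
to the OSC component that implements osc_get_notify.

#include <mpi.h>

/* Hedged usage sketch, not part of the patch: issue one notified get
 * inside a passive-target epoch and surface errors to the caller. */
static int try_get_notify(MPI_Win win, int peer)
{
    int buf[4];
    MPI_Win_set_errhandler(win, MPI_ERRORS_RETURN);
    MPI_Win_lock(MPI_LOCK_SHARED, peer, 0, win);
    int rc = MPI_Get_notify(buf, 4, MPI_INT, peer,
                            0 /* target_disp */, 4, MPI_INT,
                            0 /* notification_idx */, win);
    MPI_Win_unlock(peer, win);
    return rc; /* MPI_ERR_NOTIFY_IDX would have flagged a negative index */
}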
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+#include "ompi_config.h"
+#include <stdio.h>
+
+#include "ompi/mpi/c/bindings.h"
+#include "ompi/runtime/params.h"
+#include "ompi/communicator/communicator.h"
+#include "ompi/errhandler/errhandler.h"
+#include "ompi/win/win.h"
+#include "ompi/mca/osc/osc.h"
+#include "ompi/datatype/ompi_datatype.h"
+#include "ompi/runtime/ompi_spc.h"
+
+PROTOTYPE ERROR_CLASS get_notify(BUFFER_OUT origin_addr, COUNT origin_count,
+                                 DATATYPE origin_datatype, INT target_rank,
+                                 AINT target_disp, COUNT target_count,
+                                 DATATYPE target_datatype, INT notification_idx, WIN win)
+{
+    int rc;
+
+    SPC_RECORD(OMPI_SPC_GET_NOTIFY, 1);
+
+    if (MPI_PARAM_CHECK) {
+        rc = OMPI_SUCCESS;
+
+        OMPI_ERR_INIT_FINALIZE(FUNC_NAME);
+
+        if (ompi_win_invalid(win)) {
+            return OMPI_ERRHANDLER_NOHANDLE_INVOKE(MPI_ERR_WIN, FUNC_NAME);
+        } else if (origin_count < 0 || target_count < 0) {
+            rc = MPI_ERR_COUNT;
+        } else if (ompi_win_peer_invalid(win, target_rank) &&
+                   (MPI_PROC_NULL != target_rank)) {
+            rc = MPI_ERR_RANK;
+        } else if ( MPI_WIN_FLAVOR_DYNAMIC != win->w_flavor && target_disp < 0 ) {
+            rc = MPI_ERR_DISP;
+        } else if (notification_idx < 0) {
+            rc = MPI_ERR_NOTIFY_IDX;
+        } else {
+            OMPI_CHECK_DATATYPE_FOR_ONE_SIDED(rc, origin_datatype, origin_count);
+            if (OMPI_SUCCESS == rc) {
+                OMPI_CHECK_DATATYPE_FOR_ONE_SIDED(rc, target_datatype, target_count);
+            }
+        }
+        OMPI_ERRHANDLER_CHECK(rc, win, rc, FUNC_NAME);
+    }
+
+    if (MPI_PROC_NULL == target_rank) return MPI_SUCCESS;
+
+    rc = win->w_osc_module->osc_get_notify(origin_addr, origin_count, origin_datatype,
+                                           target_rank, target_disp, target_count,
+                                           target_datatype, notification_idx, win);
+    OMPI_ERRHANDLER_RETURN(rc, win, rc, FUNC_NAME);
+}
diff --git a/ompi/mpi/c/put_notify.c.in b/ompi/mpi/c/put_notify.c.in
new file mode 100644
index 00000000000..14ee5c7e365
--- /dev/null
+++ b/ompi/mpi/c/put_notify.c.in
@@ -0,0 +1,80 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
+/*
+ * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation.  All rights reserved.
+ * Copyright (c) 2004-2020 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+ * Copyright (c) 2004-2008 High Performance Computing Center Stuttgart,
+ *                         University of Stuttgart.  All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ * Copyright (c) 2006      Cisco Systems, Inc.  All rights reserved.
+ * Copyright (c) 2013-2015 Los Alamos National Security, LLC. All rights
+ *                         reserved.
+ * Copyright (c) 2015      Research Organization for Information Science
+ *                         and Technology (RIST). All rights reserved.
+ * Copyright (c) 2024      Triad National Security, LLC. All rights
+ *                         reserved.
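A matching sketch for the put side, under the same caveats: the window,
the target rank, the source buffer, and the notification index 7 are
illustrative only, and the notification semantics are defined by the
OSC component behind osc_put_notify.

#include <mpi.h>

/* Hedged companion sketch: one notified put inside a lock_all epoch;
 * the flush completes the put before the epoch ends. */
static int try_put_notify(MPI_Win win, int target, const int *src)
{
    MPI_Win_lock_all(0, win);
    int rc = MPI_Put_notify(src, 4, MPI_INT, target,
                            0 /* target_disp */, 4, MPI_INT,
                            7 /* notification_idx */, win);
    if (MPI_SUCCESS == rc) {
        MPI_Win_flush(target, win);
    }
    MPI_Win_unlock_all(win);
    return rc;
}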
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+#include "ompi_config.h"
+#include <stdio.h>
+
+#include "ompi/mpi/c/bindings.h"
+#include "ompi/runtime/params.h"
+#include "ompi/communicator/communicator.h"
+#include "ompi/errhandler/errhandler.h"
+#include "ompi/win/win.h"
+#include "ompi/mca/osc/osc.h"
+#include "ompi/datatype/ompi_datatype.h"
+#include "ompi/runtime/ompi_spc.h"
+
+PROTOTYPE ERROR_CLASS put_notify(BUFFER origin_addr, COUNT origin_count, DATATYPE origin_datatype,
+                                 INT target_rank, AINT target_disp, COUNT target_count,
+                                 DATATYPE target_datatype, INT notification_idx, WIN win)
+{
+    int rc;
+
+    SPC_RECORD(OMPI_SPC_PUT_NOTIFY, 1);
+
+    if (MPI_PARAM_CHECK) {
+        rc = OMPI_SUCCESS;
+
+        OMPI_ERR_INIT_FINALIZE(FUNC_NAME);
+
+        if (ompi_win_invalid(win)) {
+            return OMPI_ERRHANDLER_NOHANDLE_INVOKE(MPI_ERR_WIN, FUNC_NAME);
+        } else if (origin_count < 0 || target_count < 0) {
+            rc = MPI_ERR_COUNT;
+        } else if (ompi_win_peer_invalid(win, target_rank) &&
+                   (MPI_PROC_NULL != target_rank)) {
+            rc = MPI_ERR_RANK;
+        } else if (NULL == target_datatype ||
+                   MPI_DATATYPE_NULL == target_datatype) {
+            rc = MPI_ERR_TYPE;
+        } else if ( MPI_WIN_FLAVOR_DYNAMIC != win->w_flavor && target_disp < 0 ) {
+            rc = MPI_ERR_DISP;
+        } else if (notification_idx < 0) {
+            rc = MPI_ERR_NOTIFY_IDX;
+        } else {
+            OMPI_CHECK_DATATYPE_FOR_ONE_SIDED(rc, origin_datatype, origin_count);
+            if (OMPI_SUCCESS == rc) {
+                OMPI_CHECK_DATATYPE_FOR_ONE_SIDED(rc, target_datatype, target_count);
+            }
+        }
+        OMPI_ERRHANDLER_CHECK(rc, win, rc, FUNC_NAME);
+    }
+
+    if (MPI_PROC_NULL == target_rank) return MPI_SUCCESS;
+
+    rc = win->w_osc_module->osc_put_notify(origin_addr, origin_count, origin_datatype,
+                                           target_rank, target_disp, target_count,
+                                           target_datatype, notification_idx, win);
+    OMPI_ERRHANDLER_RETURN(rc, win, rc, FUNC_NAME);
+}
diff --git a/ompi/runtime/ompi_spc.c b/ompi/runtime/ompi_spc.c
index 6f1d8aa7d6a..1d25545c80b 100644
--- a/ompi/runtime/ompi_spc.c
+++ b/ompi/runtime/ompi_spc.c
@@ -71,8 +71,10 @@ static const ompi_spc_event_t ompi_spc_events_desc[OMPI_SPC_NUM_COUNTERS] = {
     SET_COUNTER_ARRAY(OMPI_SPC_SENDRECV, "The number of times MPI_Sendrecv was called.", false, false),
     SET_COUNTER_ARRAY(OMPI_SPC_SENDRECV_REPLACE, "The number of times MPI_Sendrecv_replace was called.", false, false),
     SET_COUNTER_ARRAY(OMPI_SPC_PUT, "The number of times MPI_Put was called.", false, false),
+    SET_COUNTER_ARRAY(OMPI_SPC_PUT_NOTIFY, "The number of times MPI_Put_notify was called.", false, false),
     SET_COUNTER_ARRAY(OMPI_SPC_RPUT, "The number of times MPI_Rput was called.", false, false),
     SET_COUNTER_ARRAY(OMPI_SPC_GET, "The number of times MPI_Get was called.", false, false),
+    SET_COUNTER_ARRAY(OMPI_SPC_GET_NOTIFY, "The number of times MPI_Get_notify was called.", false, false),
     SET_COUNTER_ARRAY(OMPI_SPC_RGET, "The number of times MPI_Rget was called.", false, false),
     SET_COUNTER_ARRAY(OMPI_SPC_PROBE, "The number of times MPI_Probe was called.", false, false),
     SET_COUNTER_ARRAY(OMPI_SPC_IPROBE, "The number of times MPI_Iprobe was called.", false, false),
diff --git a/ompi/runtime/ompi_spc.h b/ompi/runtime/ompi_spc.h
index 76ec7f25f16..3d0efd257b3 100644
--- a/ompi/runtime/ompi_spc.h
+++ b/ompi/runtime/ompi_spc.h
@@ -58,8 +58,10 @@ typedef enum ompi_spc_counters {
     OMPI_SPC_SENDRECV,
     OMPI_SPC_SENDRECV_REPLACE,
     OMPI_SPC_PUT,
+    OMPI_SPC_PUT_NOTIFY,
     OMPI_SPC_RPUT,
     OMPI_SPC_GET,
+    OMPI_SPC_GET_NOTIFY,
     OMPI_SPC_RGET,
     OMPI_SPC_PROBE,
     OMPI_SPC_IPROBE,

From 4eee1f68f9ddab0cfb085faec7060c08835e8e7b Mon Sep 17 00:00:00 2001
From:
Howard Pritchard Date: Thu, 4 Dec 2025 14:05:32 -0700 Subject: [PATCH 35/51] pmix:advance sha to d30c15e6 to pick up a fix for MPI sessions tests Signed-off-by: Howard Pritchard --- 3rd-party/openpmix | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rd-party/openpmix b/3rd-party/openpmix index 632bc703f93..d30c15e65f3 160000 --- a/3rd-party/openpmix +++ b/3rd-party/openpmix @@ -1 +1 @@ -Subproject commit 632bc703f9352655de70263313ffb77879ee4e37 +Subproject commit d30c15e65f3e6f154807424dd428698fe9fbad3a From 02a63778d7c2988ed4dcb5ed09d52034bfff447f Mon Sep 17 00:00:00 2001 From: Brelle Emmanuel Date: Mon, 8 Dec 2025 15:02:45 +0100 Subject: [PATCH 36/51] [UBCL] Fixed warnings to build with -Werror Signed-off-by: Brelle Emmanuel --- ompi/mca/osc/ubcl/osc_ubcl_accumulate.c | 4 ++-- ompi/mca/osc/ubcl/osc_ubcl_get.c | 1 - ompi/mca/osc/ubcl/osc_ubcl_put.c | 1 - ompi/mca/pml/ubcl/pml_ubcl_component.c | 2 +- ompi/mca/pml/ubcl/pml_ubcl_endpoint.c | 2 +- ompi/mca/pml/ubcl/pml_ubcl_isend.c | 4 ++-- ompi/mca/pml/ubcl/pml_ubcl_request.c | 10 +++++----- 7 files changed, 11 insertions(+), 13 deletions(-) diff --git a/ompi/mca/osc/ubcl/osc_ubcl_accumulate.c b/ompi/mca/osc/ubcl/osc_ubcl_accumulate.c index 59644e829f3..58756665ee1 100644 --- a/ompi/mca/osc/ubcl/osc_ubcl_accumulate.c +++ b/ompi/mca/osc/ubcl/osc_ubcl_accumulate.c @@ -26,7 +26,7 @@ #include "ompi/mca/osc/ubcl/osc_ubcl_request.h" #include "ompi/mca/common/ubcl/common_ubcl.h" -int get_ubcl_int_type(size_t size, bool is_signed, ubcl_win_atomic_datatype_t *ubcl_type) +static int get_ubcl_int_type(size_t size, bool is_signed, ubcl_win_atomic_datatype_t *ubcl_type) { int ret = OMPI_SUCCESS; @@ -51,7 +51,7 @@ int get_ubcl_int_type(size_t size, bool is_signed, ubcl_win_atomic_datatype_t *u return ret; } -int get_ubcl_fp_type(size_t size, ubcl_win_atomic_datatype_t *ubcl_type) +static int get_ubcl_fp_type(size_t size, ubcl_win_atomic_datatype_t *ubcl_type) { int ret = OMPI_SUCCESS; diff --git a/ompi/mca/osc/ubcl/osc_ubcl_get.c b/ompi/mca/osc/ubcl/osc_ubcl_get.c index 6dd42ef4240..f0fb8ad7706 100644 --- a/ompi/mca/osc/ubcl/osc_ubcl_get.c +++ b/ompi/mca/osc/ubcl/osc_ubcl_get.c @@ -139,7 +139,6 @@ int ompi_osc_ubcl_rget(void *origin_addr, size_t origin_count, target_iov[0].iov_base = (char *) target_addr + gap; target_iov[0].iov_len = span; } else { - int ret = OMPI_SUCCESS; ret = osc_ubcl_build_ddt_iov(target_addr, proc, target_count, target_dt, &target_iov, &target_iov_count); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { diff --git a/ompi/mca/osc/ubcl/osc_ubcl_put.c b/ompi/mca/osc/ubcl/osc_ubcl_put.c index 1a3878410ec..ae45c45d511 100644 --- a/ompi/mca/osc/ubcl/osc_ubcl_put.c +++ b/ompi/mca/osc/ubcl/osc_ubcl_put.c @@ -147,7 +147,6 @@ int ompi_osc_ubcl_rput(const void *origin_addr, size_t origin_count, target_iov[0].iov_base = (char *) target_addr + gap; target_iov[0].iov_len = span; } else { - int ret = OMPI_SUCCESS; ret = osc_ubcl_build_ddt_iov(target_addr, proc, target_count, target_dt, &target_iov, &target_iov_count); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { diff --git a/ompi/mca/pml/ubcl/pml_ubcl_component.c b/ompi/mca/pml/ubcl/pml_ubcl_component.c index 40eef2c9291..8c8dbf06080 100644 --- a/ompi/mca/pml/ubcl/pml_ubcl_component.c +++ b/ompi/mca/pml/ubcl/pml_ubcl_component.c @@ -191,7 +191,7 @@ int mca_pml_ubcl_component_register(void) return OMPI_SUCCESS; } -static void mca_pml_ubcl_check_cuda_accelerator() +static void mca_pml_ubcl_check_cuda_accelerator(void) { const char* cuda_component_name = "cuda"; const char* 
selected_component_name = opal_accelerator_base_selected_component.base_version.mca_component_name; diff --git a/ompi/mca/pml/ubcl/pml_ubcl_endpoint.c b/ompi/mca/pml/ubcl/pml_ubcl_endpoint.c index 04e29babed9..8f17e8f3bea 100644 --- a/ompi/mca/pml/ubcl/pml_ubcl_endpoint.c +++ b/ompi/mca/pml/ubcl/pml_ubcl_endpoint.c @@ -309,7 +309,7 @@ void mca_pml_ubcl_endpoint_retain(ompi_proc_t *proc) OBJ_RETAIN(proc); } -int mca_pml_ubcl_create_endpoints(ompi_proc_t *proc) +static int mca_pml_ubcl_create_endpoints(ompi_proc_t *proc) { int err = OMPI_SUCCESS; mca_common_ubcl_endpoint_t *new_endpoint; diff --git a/ompi/mca/pml/ubcl/pml_ubcl_isend.c b/ompi/mca/pml/ubcl/pml_ubcl_isend.c index 9d5b282d884..3bd19f0852c 100644 --- a/ompi/mca/pml/ubcl/pml_ubcl_isend.c +++ b/ompi/mca/pml/ubcl/pml_ubcl_isend.c @@ -100,7 +100,7 @@ static inline void mca_pml_ubcl_isend_prepare(const void *buf, size_t count, void mca_pml_ubcl_isend_start(struct ompi_request_t **request) { OPAL_OUTPUT_VERBOSE( - (50, mca_pml_ubcl_component.output, "UBCL_MODULE_ISEND_START %p\n", *request)); + (50, mca_pml_ubcl_component.output, "UBCL_MODULE_ISEND_START %p\n", *(void **)request)); mca_pml_ubcl_request_t *req = container_of((*request), mca_pml_ubcl_request_t, ompi_req); @@ -167,7 +167,7 @@ void mca_pml_ubcl_isend_start(struct ompi_request_t **request) OPAL_OUTPUT_VERBOSE((50, mca_pml_ubcl_component.output, "PML/UBCL ISEND: send mpi_tag=%x comm_id=%zu\n", tag, ubcl_cid.bits)); OPAL_OUTPUT_VERBOSE( - (50, mca_pml_ubcl_component.output, "PML/UBCL ISEND: ompi_req=%p\n", *request)); + (50, mca_pml_ubcl_component.output, "PML/UBCL ISEND: ompi_req=%p\n", *(void **)request)); OPAL_OUTPUT_VERBOSE( (50, mca_pml_ubcl_component.output, "PML/UBCL ISEND: sending to rank=%zu\n", endpoint->rank)); diff --git a/ompi/mca/pml/ubcl/pml_ubcl_request.c b/ompi/mca/pml/ubcl/pml_ubcl_request.c index 282f87711a0..b5a206f504c 100644 --- a/ompi/mca/pml/ubcl/pml_ubcl_request.c +++ b/ompi/mca/pml/ubcl/pml_ubcl_request.c @@ -177,7 +177,7 @@ mca_pml_ubcl_request_cancel(struct ompi_request_t *request, int complete) return OMPI_SUCCESS; } -int mca_pml_ubcl_request_complete(struct ompi_request_t *request) +static int mca_pml_ubcl_request_complete(struct ompi_request_t *request) { /* Null check */ if (MPI_REQUEST_NULL == request) { @@ -248,7 +248,7 @@ void ubcl_request_send_complete_cb(ubcl_status_t status, void *cb_data) } OPAL_OUTPUT_VERBOSE((50, mca_pml_ubcl_component.output, - "PML/UBCL SEND_COMPLETE pml_req=%p mpi_tag=%x\n", req, req->tag)); + "PML/UBCL SEND_COMPLETE pml_req=%p mpi_tag=%x\n", (void *)req, req->tag)); /** mca_pml_ubcl_request_complete((ompi_request_t *) cb_data); */ } @@ -284,7 +284,7 @@ void ubcl_request_recv_complete_cb(ubcl_status_t status, void *cb_data) ompi_request_complete(&(req->ompi_req), true); OPAL_OUTPUT_VERBOSE((50, mca_pml_ubcl_component.output, - "PML/UBCL RECV_COMPLETE pml_req=%p mpi_tag=%d\n", req, req->tag)); + "PML/UBCL RECV_COMPLETE pml_req=%p mpi_tag=%d\n", (void *)req, req->tag)); /** mca_pml_ubcl_request_complete((ompi_request_t *) cb_data); */ } @@ -296,7 +296,7 @@ void ubcl_request_recv_complete_cb(ubcl_status_t status, void *cb_data) void mca_pml_ubcl_request_finalize(mca_pml_ubcl_request_t *req) { OPAL_OUTPUT_VERBOSE((50, mca_pml_ubcl_component.output, - "PML/UBCL REQUEST_FINALIZE BEGIN pml_req=%p mpi_tag=%x\n", req, req->tag)); + "PML/UBCL REQUEST_FINALIZE BEGIN pml_req=%p mpi_tag=%x\n", (void *)req, req->tag)); opal_convertor_cleanup(&req->convertor); OBJ_DESTRUCT(&req->convertor); @@ -308,7 +308,7 @@ void 
mca_pml_ubcl_request_finalize(mca_pml_ubcl_request_t *req) opal_free_list_return(&mca_pml_ubcl_component.pml_req_free_list, (opal_free_list_item_t *) req); OPAL_OUTPUT_VERBOSE( - (50, mca_pml_ubcl_component.output, "PML/UBCL REQUEST_FINALIZED %p\n", req)); + (50, mca_pml_ubcl_component.output, "PML/UBCL REQUEST_FINALIZED %p\n", (void *)req)); } From 81adcc0245e79ef3ccfda9bff2a367a75ff2162d Mon Sep 17 00:00:00 2001 From: Brelle Emmanuel Date: Thu, 4 Dec 2025 13:49:17 +0100 Subject: [PATCH 37/51] [OSC/UBCL] Fix returned error codes with overlapping exposure epochs Signed-off-by: Brelle Emmanuel --- ompi/mca/osc/ubcl/osc_ubcl_sync.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/ompi/mca/osc/ubcl/osc_ubcl_sync.c b/ompi/mca/osc/ubcl/osc_ubcl_sync.c index b47a682feda..5b4f9feeab4 100644 --- a/ompi/mca/osc/ubcl/osc_ubcl_sync.c +++ b/ompi/mca/osc/ubcl/osc_ubcl_sync.c @@ -192,7 +192,7 @@ int ompi_osc_ubcl_lock(int lock_type, int target, int assert, struct ompi_win_t /* check synchronization type */ if (UBCL_WIN_SYNC_NONE != module->sync_type && UBCL_WIN_SYNC_LOCK != module->sync_type && UBCL_WIN_SYNC_FENCE != module->sync_type) { - ret = OMPI_ERR_RMA_CONFLICT; + ret = OMPI_ERR_RMA_SYNC; mca_osc_ubcl_warn(ret, "Failed to lock window %s already in sync type %s", win->w_name, osc_ubcl_sync_name(module->sync_type)); goto return_locked; @@ -219,7 +219,7 @@ int ompi_osc_ubcl_lock(int lock_type, int target, int assert, struct ompi_win_t /* check access epoch */ if (UBCL_WIN_SYNC_NONE != module->procs_sync_type[target]) { - ret = OMPI_ERR_RMA_CONFLICT; + ret = OMPI_ERR_RMA_SYNC; mca_osc_ubcl_warn(ret, "Target %d is already locked on window %s", target, win->w_name); goto return_locked; @@ -271,7 +271,7 @@ int ompi_osc_ubcl_unlock(int target, struct ompi_win_t *win) if (UBCL_WIN_SYNC_LOCK != module->sync_type || (UBCL_WIN_SYNC_LOCK != module->procs_sync_type[target] && UBCL_WIN_SYNC_LOCK_NO_CHECK != module->procs_sync_type[target])) { - ret = OMPI_ERR_RMA_CONFLICT; + ret = OMPI_ERR_RMA_SYNC; mca_osc_ubcl_warn(ret, "Target %d is not locked so it cannot be unlocked " "window %s (sync type %s)", target, win->w_name, osc_ubcl_sync_name(module->sync_type)); @@ -344,7 +344,7 @@ static int get_all_ubcl_ranks(struct ompi_win_t *win, ubcl_rank_t *all_ranks) /* lock_all doesn't need to check the exposure epoch because if there was another * one started (individual lock or lock_all) then module->sync_type would be - * different from UBCL_WIN_SYNC_NONE therefore returning OMPI_ERR_RMA_CONFLICT. + * different from UBCL_WIN_SYNC_NONE therefore returning OMPI_ERR_RMA_SYNC. 
* Stemming from this, unlock_all doesn't need to check the epoch either */ int ompi_osc_ubcl_lock_all(int assert, struct ompi_win_t *win) @@ -359,7 +359,7 @@ int ompi_osc_ubcl_lock_all(int assert, struct ompi_win_t *win) /* check access epoch */ if (UBCL_WIN_SYNC_NONE != module->sync_type && UBCL_WIN_SYNC_FENCE != module->sync_type) { - ret = OMPI_ERR_RMA_CONFLICT; + ret = OMPI_ERR_RMA_SYNC; mca_osc_ubcl_warn(ret, "Failed to lock_all window %s already in sync type %s", win->w_name, osc_ubcl_sync_name(module->sync_type)); return ret; @@ -413,7 +413,7 @@ int ompi_osc_ubcl_unlock_all(struct ompi_win_t *win) /* check access epoch */ if (UBCL_WIN_SYNC_LOCK_ALL != module->sync_type) { - return OMPI_ERR_RMA_CONFLICT; + return OMPI_ERR_RMA_SYNC; } group_size = ompi_group_size(win->w_group); @@ -513,7 +513,7 @@ int ompi_osc_ubcl_complete(struct ompi_win_t *win) OPAL_THREAD_LOCK(&module->sync_lock); if (UBCL_WIN_SYNC_PSCW != module->sync_type) { - ret = OMPI_ERR_RMA_CONFLICT; + ret = OMPI_ERR_RMA_SYNC; mca_osc_ubcl_warn(ret, "Failed to complete window %s in sync type %s", win->w_name, osc_ubcl_sync_name(module->sync_type)); goto return_locked; @@ -596,7 +596,7 @@ int ompi_osc_ubcl_post(struct ompi_group_t *group, int assert, struct ompi_win_t || ( UBCL_WIN_SYNC_NONE != module->sync_type && UBCL_WIN_SYNC_FENCE != module->sync_type && UBCL_WIN_SYNC_PSCW != module->sync_type )) { - ret = OMPI_ERR_RMA_CONFLICT; + ret = OMPI_ERR_RMA_SYNC; mca_osc_ubcl_warn(ret, "Failed to post window %s already in sync type %s", win->w_name, osc_ubcl_sync_name(module->sync_type)); goto return_locked; @@ -662,7 +662,7 @@ int ompi_osc_ubcl_test(struct ompi_win_t *win, int *flag) } if (UBCL_WIN_SYNC_PSCW != module->sync_type) { - ret = OMPI_ERR_RMA_CONFLICT; + ret = OMPI_ERR_RMA_SYNC; mca_osc_ubcl_warn(ret, "Failed to test window %s in sync type %s", win->w_name, osc_ubcl_sync_name(module->sync_type)); goto return_locked; @@ -742,7 +742,7 @@ int ompi_osc_ubcl_fence(int assert, struct ompi_win_t *win) if (UBCL_WIN_SYNC_FENCE != module->sync_type && UBCL_WIN_SYNC_FENCE_EPOCH != module->sync_type && UBCL_WIN_SYNC_NONE != module->sync_type) { - ret = OMPI_ERR_RMA_CONFLICT; + ret = OMPI_ERR_RMA_SYNC; mca_osc_ubcl_warn(ret, "Failed to fence window %s in sync type %s", win->w_name, osc_ubcl_sync_name(module->sync_type)); return ret; From a0acc73d1dd1b0ab3c1021d45cdeed3b600c5ed0 Mon Sep 17 00:00:00 2001 From: Brelle Emmanuel Date: Thu, 4 Dec 2025 14:36:59 +0100 Subject: [PATCH 38/51] [OSC/UBCL] Window info violations are not fatal: just print a warning and return an error Signed-off-by: Brelle Emmanuel --- ompi/mca/osc/ubcl/osc_ubcl_sync.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/ompi/mca/osc/ubcl/osc_ubcl_sync.c b/ompi/mca/osc/ubcl/osc_ubcl_sync.c index 5b4f9feeab4..afca78a072e 100644 --- a/ompi/mca/osc/ubcl/osc_ubcl_sync.c +++ b/ompi/mca/osc/ubcl/osc_ubcl_sync.c @@ -184,7 +184,8 @@ int ompi_osc_ubcl_lock(int lock_type, int target, int assert, struct ompi_win_t mca_osc_ubcl_module_t *module = (mca_osc_ubcl_module_t *) win->w_osc_module; if (module->no_locks) { - mca_osc_ubcl_error(OMPI_ERR_RMA_SYNC, "MPI_Win_lock : window %d is no_locks=true", module->wid); + mca_osc_ubcl_warn(OMPI_ERR_RMA_SYNC, "MPI_Win_lock : window %d is no_locks=true", module->wid); + return OMPI_ERR_RMA_SYNC; } OPAL_THREAD_LOCK(&module->sync_lock); @@ -262,7 +263,8 @@ int ompi_osc_ubcl_unlock(int target, struct ompi_win_t *win) mca_osc_ubcl_module_t *module = (mca_osc_ubcl_module_t *) win->w_osc_module; if
(module->no_locks) { - mca_osc_ubcl_error(OMPI_ERR_RMA_SYNC, "MPI_Win_unlock : window %d is no_locks=true", module->wid); + mca_osc_ubcl_warn(OMPI_ERR_RMA_SYNC, "MPI_Win_unlock : window %d is no_locks=true", module->wid); + return OMPI_ERR_RMA_SYNC; } OPAL_THREAD_LOCK(&module->sync_lock); @@ -354,7 +356,8 @@ int ompi_osc_ubcl_lock_all(int assert, struct ompi_win_t *win) mca_osc_ubcl_module_t *module = (mca_osc_ubcl_module_t *) win->w_osc_module; if (module->no_locks) { - mca_osc_ubcl_error(OMPI_ERR_RMA_SYNC, "MPI_Win_lockall : window %d is no_locks=true", module->wid); + mca_osc_ubcl_warn(OMPI_ERR_RMA_SYNC, "MPI_Win_lock_all : window %d is no_locks=true", module->wid); + return OMPI_ERR_RMA_SYNC; } /* check access epoch */ @@ -401,7 +404,8 @@ int ompi_osc_ubcl_unlock_all(struct ompi_win_t *win) mca_osc_ubcl_module_t *module = (mca_osc_ubcl_module_t *) win->w_osc_module; if (module->no_locks) { - mca_osc_ubcl_error(OMPI_ERR_RMA_SYNC, "MPI_Win_unlockall : window %d is no_locks=true", module->wid); + mca_osc_ubcl_warn(OMPI_ERR_RMA_SYNC, "MPI_Win_unlock_all : window %d is no_locks=true", module->wid); + return OMPI_ERR_RMA_SYNC; } if (UBCL_WIN_SYNC_LOCK_ALL_NO_CHECK == module->sync_type) { From 9ca90fa6ea7047060a190e55384aeef07e180789 Mon Sep 17 00:00:00 2001 From: "DUPRAT, JULIEN" Date: Wed, 22 Oct 2025 14:47:59 +0200 Subject: [PATCH 39/51] [UBCL] No bxi endpoints if we don't need them Signed-off-by: Brelle Emmanuel --- ompi/mca/pml/ubcl/pml_ubcl_endpoint.c | 55 ++++++++++++++------------- 1 file changed, 28 insertions(+), 27 deletions(-) diff --git a/ompi/mca/pml/ubcl/pml_ubcl_endpoint.c b/ompi/mca/pml/ubcl/pml_ubcl_endpoint.c index 8f17e8f3bea..509ad505011 100644 --- a/ompi/mca/pml/ubcl/pml_ubcl_endpoint.c +++ b/ompi/mca/pml/ubcl/pml_ubcl_endpoint.c @@ -111,7 +111,7 @@ static int mca_pml_ubcl_export_local_endpoint_handle(const int type) err = ubcl_export_local_endpoint_handle(type, endpoint_h, &remote_rank_u64); if (UBCL_SUCCESS != err) { - return OMPI_ERROR; + return ubcl_error_to_ompi(err); } mca_pml_ubcl_endpoint_modex_put(type, (void *) endpoint_h, size); @@ -120,10 +120,10 @@ static int mca_pml_ubcl_export_local_endpoint_handle(const int type) * The actual recv rank will be allocated during add_procs calls */ err = ubcl_close_local_endpoint_channel(type, remote_rank_u64); if (UBCL_SUCCESS != err) { - mca_pml_ubcl_warn(OMPI_ERROR, + mca_pml_ubcl_warn(ubcl_error_to_ompi(err), "PML/UBCL failed to clean local endpoint (very unlikely error)." " For safety reasons PML will be disabled."); - return OMPI_ERROR; + return ubcl_error_to_ompi(err); } return OMPI_SUCCESS; @@ -133,35 +133,31 @@ int mca_pml_ubcl_create_local_endpoint(void) { int type; ubcl_error_t err; - int ompi_error; type = UBCL_ENDPOINT_TYPE_SELF; err = ubcl_create_local_endpoint(type); if (UBCL_SUCCESS != err) { - mca_pml_ubcl_error(OMPI_ERROR, "Failed ubcl_create_local_endpoint %d (%d)", type, err); + mca_pml_ubcl_warn(ubcl_error_to_ompi(err), "Failed ubcl_create_local_endpoint %d (%d)", type, err); } - /* UBCL_ENDPOINT_SHM */ if (!mca_pml_ubcl_component.force_intranode_bxi) { type = UBCL_ENDPOINT_TYPE_SHMEM; err = ubcl_create_local_endpoint(type); - if (UBCL_SUCCESS != err) { - mca_pml_ubcl_error(OMPI_ERROR, "Failed ubcl_create_local_endpoint %d (%d)", type, err); + if (UBCL_SUCCESS == err) { + err = mca_pml_ubcl_export_local_endpoint_handle(type); } - ompi_error = mca_pml_ubcl_export_local_endpoint_handle(type); - if (OMPI_SUCCESS != ompi_error) { - return ompi_error; + if (UBCL_SUCCESS != err) { + mca_pml_ubcl_warn(ubcl_error_to_ompi(err), "Failed ubcl_create_local_endpoint %d (%d)", type, err); } } type = UBCL_ENDPOINT_TYPE_BXI; err = ubcl_create_local_endpoint(type); - if (UBCL_SUCCESS != err) { - mca_pml_ubcl_error(OMPI_ERROR, "Failed ubcl_create_local_endpoint %d (%d)", type, err); + if (UBCL_SUCCESS == err) { + err = mca_pml_ubcl_export_local_endpoint_handle(type); } - ompi_error = mca_pml_ubcl_export_local_endpoint_handle(type); - if (OMPI_SUCCESS != ompi_error) { - return ompi_error; + if (UBCL_SUCCESS != err) { + mca_pml_ubcl_warn(ubcl_error_to_ompi(err), "Failed ubcl_create_local_endpoint %d (%d)", type, err); } return OMPI_SUCCESS; @@ -170,20 +166,23 @@ int mca_pml_ubcl_create_local_endpoint(void) int mca_pml_ubcl_free_local_endpoints() { int ret; - /* Finalize BXI */ ret = ubcl_free_local_endpoint(UBCL_ENDPOINT_TYPE_BXI); - if (UBCL_SUCCESS != ret) { - return OMPI_ERROR; + if (UBCL_SUCCESS != ret && UBCL_ERR_NOT_AVAILABLE != ret) { + /* If the transport was unavailable we silence the error, + * we're closing it anyway */ + return ubcl_error_to_ompi(ret); } + if (!mca_pml_ubcl_component.force_intranode_bxi) { ret = ubcl_free_local_endpoint(UBCL_ENDPOINT_TYPE_SHMEM); - if (UBCL_SUCCESS != ret) { - return OMPI_ERROR; + if (UBCL_SUCCESS != ret && UBCL_ERR_NOT_AVAILABLE != ret) { + return ubcl_error_to_ompi(ret); } } + ret = ubcl_free_local_endpoint(UBCL_ENDPOINT_TYPE_SELF); - if (UBCL_SUCCESS != ret) { - return OMPI_ERROR; + if (UBCL_SUCCESS != ret && UBCL_ERR_NOT_AVAILABLE != ret) { + return ubcl_error_to_ompi(ret); } return OMPI_SUCCESS; @@ -331,14 +330,16 @@ static int mca_pml_ubcl_create_endpoints(ompi_proc_t *proc) err = mca_pml_ubcl_create_recv_endpoint(new_endpoint->rank, new_endpoint->type); if (OMPI_SUCCESS != err) { - mca_pml_ubcl_error(err, "Failed to create recv endpoint for rank %zu\n", - new_endpoint->rank); + mca_pml_ubcl_warn(err, "Failed to create recv endpoint for rank %zu\n", + new_endpoint->rank); + return err; } err = mca_pml_ubcl_create_send_endpoint(proc, new_endpoint->rank, new_endpoint->type); if (OMPI_SUCCESS != err) { - mca_pml_ubcl_error(err, "Failed to create send endpoint for rank %zu\n", - new_endpoint->rank); + mca_pml_ubcl_warn(err, "Failed to create send endpoint for rank %zu\n", - new_endpoint->rank); + return err; } end: From 959dc301d77e426a9a5b2f6f8d1c4e715cb6b19b Mon Sep 17 00:00:00 2001 From: "DUPRAT, JULIEN" Date: Mon, 3 Nov 2025 12:05:28 +0100 Subject: [PATCH 40/51] [PML/UBCL] add_procs falls back to a higher
transport when required Signed-off-by: Brelle Emmanuel --- ompi/mca/pml/ubcl/pml_ubcl_endpoint.c | 71 +++++++++++++++++++++------ 1 file changed, 57 insertions(+), 14 deletions(-) diff --git a/ompi/mca/pml/ubcl/pml_ubcl_endpoint.c b/ompi/mca/pml/ubcl/pml_ubcl_endpoint.c index 509ad505011..400384696e6 100644 --- a/ompi/mca/pml/ubcl/pml_ubcl_endpoint.c +++ b/ompi/mca/pml/ubcl/pml_ubcl_endpoint.c @@ -254,7 +254,7 @@ static int mca_pml_ubcl_create_recv_endpoint(uint64_t sender_rank, const int typ err = ubcl_export_local_endpoint_handle(type, endpoint_h, &remote_rank_u64); if (UBCL_SUCCESS != err) { - return OMPI_ERROR; + return ubcl_error_to_ompi(err); } return OMPI_SUCCESS; @@ -269,11 +269,11 @@ static int mca_pml_ubcl_create_self_endpoints(uint64_t remote_rank) err = ubcl_export_local_endpoint_handle(type, endpoint_h, &my_rank); if (UBCL_SUCCESS != err) { - return OMPI_ERROR; + return ubcl_error_to_ompi(err); } err = ubcl_create_remote_endpoint(my_rank, my_rank, type, endpoint_h); if (UBCL_SUCCESS != err) { - return OMPI_ERROR; + return ubcl_error_to_ompi(err); } return OMPI_SUCCESS; @@ -295,6 +295,25 @@ static int get_endpoint_type(ompi_proc_t *proc) } } +static enum ubcl_endpoint_type_t mca_pml_ubcl_get_higher_transport( + enum ubcl_endpoint_type_t type) +{ + switch ((int) type) { + case UBCL_ENDPOINT_TYPE_SELF: + case UBCL_ENDPOINT_TYPE_SHMEM: + type++; + break; + /* There is no valid higher transport */ + case UBCL_ENDPOINT_TYPE_BXI: + default: + type = UBCL_ENDPOINT_TYPE_NONE; + /* Not a valid transport */ + break; + } + + return type; +} + void mca_pml_ubcl_endpoint_retain(ompi_proc_t *proc) { mca_common_ubcl_endpoint_t *endpoint = NULL; @@ -311,6 +330,7 @@ void mca_pml_ubcl_endpoint_retain(ompi_proc_t *proc) static int mca_pml_ubcl_create_endpoints(ompi_proc_t *proc) { int err = OMPI_SUCCESS; + enum ubcl_endpoint_type_t type; mca_common_ubcl_endpoint_t *new_endpoint; new_endpoint = malloc(sizeof(mca_common_ubcl_endpoint_t)); @@ -321,21 +341,43 @@ static int mca_pml_ubcl_create_endpoints(ompi_proc_t *proc) new_endpoint->refcount = 0; //we increment it to 1 in endpoint_retain new_endpoint->rank = mca_pml_forge_rank(proc); - new_endpoint->type = get_endpoint_type(proc); + type = get_endpoint_type(proc); - if (UBCL_ENDPOINT_TYPE_SELF == new_endpoint->type) { + if (UBCL_ENDPOINT_TYPE_SELF == type) { err = mca_pml_ubcl_create_self_endpoints((uint64_t) new_endpoint->rank); - goto end; - } - err = mca_pml_ubcl_create_recv_endpoint(new_endpoint->rank, new_endpoint->type); - if (OMPI_SUCCESS != err) { - mca_pml_ubcl_warn(err, "Failed to create recv endpoint for rank %zu\n", - new_endpoint->rank); - return err; + /* If the transport is unavailable (either explicitly disabled, + * or just unavailable) we do not return any error. + * If UBCL encountered another error we return it */ + if (OMPI_SUCCESS == err) { + goto end; + } else if (OMPI_ERR_NOT_AVAILABLE != err) { + return err; + } } - err = mca_pml_ubcl_create_send_endpoint(proc, new_endpoint->rank, new_endpoint->type); + /* If a transport is unavailable only a higher transport can take its place, + * i.e.
if SHM is unavailable, SELF cannot replace it but BXI can */ + do { + err = mca_pml_ubcl_create_recv_endpoint(new_endpoint->rank, type); + + if (OMPI_ERR_NOT_AVAILABLE == err) { + type = mca_pml_ubcl_get_higher_transport(type); + if (UBCL_ENDPOINT_TYPE_NONE == type) { + mca_pml_ubcl_warn(err, "Failed to create recv endpoint for rank %zu\n", + new_endpoint->rank); + return err; + } + } else if (OMPI_SUCCESS != err) { + mca_pml_ubcl_warn(err, "Failed to create recv endpoint for rank %zu\n", + new_endpoint->rank); + return err; + } + } while (OMPI_SUCCESS != err); + + /* No need to loop again: if the transport became unavailable between + * the last operation and this one we can consider this an error */ + err = mca_pml_ubcl_create_send_endpoint(proc, new_endpoint->rank, type); if (OMPI_SUCCESS != err) { mca_pml_ubcl_warn(err, "Failed to create send endpoint for rank %zu\n", new_endpoint->rank); @@ -343,10 +385,11 @@ static int mca_pml_ubcl_create_endpoints(ompi_proc_t *proc) } end: + new_endpoint->type = type; (proc)->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PML] = new_endpoint; mca_pml_ubcl_endpoint_retain(proc); - return err; + return OMPI_SUCCESS; } int mca_pml_ubcl_add_procs(ompi_proc_t **procs, size_t nprocs) From 90532de15fa149801bf8b88aa9f4071a6b404f94 Mon Sep 17 00:00:00 2001 From: "DUPRAT, JULIEN" Date: Mon, 24 Nov 2025 14:37:15 +0100 Subject: [PATCH 41/51] [PML/UBCL] free endpoints when transport init failed Signed-off-by: Brelle Emmanuel --- ompi/mca/pml/ubcl/pml_ubcl_endpoint.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/ompi/mca/pml/ubcl/pml_ubcl_endpoint.c b/ompi/mca/pml/ubcl/pml_ubcl_endpoint.c index 400384696e6..8754aabebc6 100644 --- a/ompi/mca/pml/ubcl/pml_ubcl_endpoint.c +++ b/ompi/mca/pml/ubcl/pml_ubcl_endpoint.c @@ -352,7 +352,7 @@ static int mca_pml_ubcl_create_endpoints(ompi_proc_t *proc) if (OMPI_SUCCESS == err) { goto end; } else if (OMPI_ERR_NOT_AVAILABLE != err) { - return err; + goto error; } } @@ -366,12 +366,12 @@ static int mca_pml_ubcl_create_endpoints(ompi_proc_t *proc) if (UBCL_ENDPOINT_TYPE_NONE == type) { mca_pml_ubcl_warn(err, "Failed to create recv endpoint for rank %zu\n", new_endpoint->rank); - return err; + goto error; } } else if (OMPI_SUCCESS != err) { mca_pml_ubcl_warn(err, "Failed to create recv endpoint for rank %zu\n", new_endpoint->rank); - return err; + goto error; } } while (OMPI_SUCCESS != err); @@ -381,7 +381,7 @@ static int mca_pml_ubcl_create_endpoints(ompi_proc_t *proc) if (OMPI_SUCCESS != err) { mca_pml_ubcl_warn(err, "Failed to create send endpoint for rank %zu\n", new_endpoint->rank); - return err; + goto error; } end: @@ -390,6 +390,10 @@ static int mca_pml_ubcl_create_endpoints(ompi_proc_t *proc) mca_pml_ubcl_endpoint_retain(proc); return OMPI_SUCCESS; + +error: + free(new_endpoint); + return err; } int mca_pml_ubcl_add_procs(ompi_proc_t **procs, size_t nprocs) From 50c2cb965867bee0712c64651ddf8a7bb5fadd94 Mon Sep 17 00:00:00 2001 From: Edgar Gabriel Date: Wed, 10 Dec 2025 18:57:23 +0000 Subject: [PATCH 42/51] Pass over the tuning section: update the MPI I/O and networking sections. Create a new directory for accelerator related stuff.
Co-authored-by: Jeff Squyres Signed-off-by: Edgar Gabriel --- .../{networking => accelerators}/cuda.rst | 0 docs/tuning-apps/accelerators/index.rst | 16 ++ docs/tuning-apps/accelerators/initialize.rst | 39 +++ docs/tuning-apps/accelerators/memkind.rst | 64 +++++ docs/tuning-apps/accelerators/rocm.rst | 269 ++++++++++++++++++ docs/tuning-apps/coll-tuned.rst | 2 +- docs/tuning-apps/index.rst | 1 + docs/tuning-apps/mpi-io.rst | 27 +- docs/tuning-apps/networking/index.rst | 2 - docs/tuning-apps/networking/rocm.rst | 134 --------- docs/tuning-apps/networking/shared-memory.rst | 16 +- 11 files changed, 415 insertions(+), 155 deletions(-) rename docs/tuning-apps/{networking => accelerators}/cuda.rst (100%) create mode 100644 docs/tuning-apps/accelerators/index.rst create mode 100644 docs/tuning-apps/accelerators/initialize.rst create mode 100644 docs/tuning-apps/accelerators/memkind.rst create mode 100644 docs/tuning-apps/accelerators/rocm.rst delete mode 100644 docs/tuning-apps/networking/rocm.rst diff --git a/docs/tuning-apps/networking/cuda.rst b/docs/tuning-apps/accelerators/cuda.rst similarity index 100% rename from docs/tuning-apps/networking/cuda.rst rename to docs/tuning-apps/accelerators/cuda.rst diff --git a/docs/tuning-apps/accelerators/index.rst b/docs/tuning-apps/accelerators/index.rst new file mode 100644 index 00000000000..c6b0ecfcd70 --- /dev/null +++ b/docs/tuning-apps/accelerators/index.rst @@ -0,0 +1,16 @@ +Accelerator support +=================== + +Open MPI supports a variety of different accelerator vendor +eco-systems. This section provides some generic guidance on tuning MPI +applications that use device memory, as well as vendor specific +options. + + +.. toctree:: + :maxdepth: 1 + + initialize + memkind + cuda + rocm diff --git a/docs/tuning-apps/accelerators/initialize.rst b/docs/tuning-apps/accelerators/initialize.rst new file mode 100644 index 00000000000..0bd147f4efb --- /dev/null +++ b/docs/tuning-apps/accelerators/initialize.rst @@ -0,0 +1,39 @@ +Selecting an Accelerator Device before calling MPI_Init +======================================================= + +A common problem when using accelerators arises when selecting which +GPU should be used by an MPI process. The decision is often based on +the rank of that process in ``MPI_COMM_WORLD``. The rank of a process +can, however, only be retrieved after the MPI library has been +initialized. On the other hand, the accelerator resources initialized +during ``MPI_Init`` can have some associations with the `current` +device, which will be the default device used by a particular +eco-system if not set to a different value. + +To circumvent this circular problem, applications are encouraged to +make use of the environment variable ``OMPI_COMM_WORLD_LOCAL_RANK`` +that is set by Open MPI at launch time and can be retrieved before +``MPI_Init``. An example using the HIP programming model +looks as follows: + +.. code-block:: c + + int num_devices; + hipGetDeviceCount(&num_devices); + assert (num_devices > 0); + + char* ompi_local_rank = getenv("OMPI_COMM_WORLD_LOCAL_RANK"); + if (NULL != ompi_local_rank) { + hipSetDevice(atoi(ompi_local_rank) % num_devices); + } + + MPI_Init (&argc, &argv); + ... + + +.. note:: Open MPI currently assumes that an MPI process is using a + single accelerator device. Certain software stacks might be + able to support multiple GPUs per rank.
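+ +The same pattern applies to other accelerator eco-systems. As a +minimal sketch, the equivalent code using the CUDA runtime API +(``cudaGetDeviceCount`` and ``cudaSetDevice`` take the place of their +HIP counterparts) would look like: + +.. code-block:: c + + int num_devices; + cudaGetDeviceCount(&num_devices); + assert (num_devices > 0); + + /* Map the node-local rank onto the visible devices */ + char* ompi_local_rank = getenv("OMPI_COMM_WORLD_LOCAL_RANK"); + if (NULL != ompi_local_rank) { + cudaSetDevice(atoi(ompi_local_rank) % num_devices); + } + + MPI_Init (&argc, &argv);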
+ + + diff --git a/docs/tuning-apps/accelerators/memkind.rst b/docs/tuning-apps/accelerators/memkind.rst new file mode 100644 index 00000000000..7567d9f7e82 --- /dev/null +++ b/docs/tuning-apps/accelerators/memkind.rst @@ -0,0 +1,64 @@ +Support for Memory-kind Info Objects +==================================== + +`MPI version 4.1. `_ +introduced the notion of memory allocation kinds, which allow an +application to specify what memory types it plans to use, and to query +what memory types are supported by the MPI library in a portable +manner. In addition, the application can place restrictions on certain +objects, such as creating a separate communicator for use with +host memory and a communicator that will be used with device memory +only. This approach allows the MPI library to perform certain +optimizations, such as bypassing memory-type checks on buffer +pointers. Please refer to the MPI specification as well as the `Memory +Allocation Kinds Side Document +`_ for more +details and examples. + +Open MPI starting from version 6.0.0 supports the following values for the memory allocation kind Info object: + +* mpi +* system +* cuda:device +* cuda:host +* cuda:managed +* level_zero:device +* level_zero:host +* level_zero:shared +* rocm:device +* rocm:host +* rocm:managed + +.. note:: Support for accelerator memory allocation kind info objects + will depend on the accelerator support compiled into Open + MPI. + + +Passing memory-kind info to mpiexec +=================================== + +The following example demonstrates how to pass memory allocation kind +information to Open MPI at application launch: + +.. code:: sh + + # Specify that the application will use system, MPI, and CUDA device memory + shell$ mpiexec --memory-allocation-kinds system,mpi,cuda:device -n 64 ./<executable> + +Asserting usage of memory kind when creating a Communicator +=========================================================== + +The following code snippet demonstrates how to assert that a +communicator will only be used for ROCm device buffers: + +.. code:: c + + MPI_Info info_assert; + MPI_Info_create (&info_assert); + char assert_key[] = "mpi_assert_memory_alloc_kinds"; + char assert_value[] = "rocm:device"; + MPI_Info_set (info_assert, assert_key, assert_value); + + MPI_Comm comm_dup; + MPI_Comm_dup_with_info (MPI_COMM_WORLD, info_assert, &comm_dup); + ... diff --git a/docs/tuning-apps/accelerators/rocm.rst b/docs/tuning-apps/accelerators/rocm.rst new file mode 100644 index 00000000000..812734088d3 --- /dev/null +++ b/docs/tuning-apps/accelerators/rocm.rst @@ -0,0 +1,269 @@ +ROCm +==== + +ROCm is the name of the software stack used by AMD GPUs. It includes +the ROCm Runtime (ROCr), the HIP programming model, and numerous +numerical and machine learning libraries tuned for the AMD Instinct and Radeon +accelerators. More information can be found at the following +`AMD webpages `_ + + +Building Open MPI with ROCm support +----------------------------------- + +ROCm-aware support means that the MPI library can send and receive +data from AMD GPU device buffers directly. Starting from Open MPI +v6.0.0 ROCm support is available directly within Open MPI for single +node scenarios, and through UCX or libfabric for multi-node scenarios. + + +Compiling Open MPI with ROCm support +------------------------------------ + +Compiling Open MPI with ROCm support requires setting the +``--with-rocm=<path>`` option at configure time: + +..
code-block:: sh + + # Configure Open MPI with ROCm support + shell$ cd ompi + shell$ ./configure --with-rocm=/opt/rocm \ + <other configure params> + + +///////////////////////////////////////////////////////////////////////// + +Checking that Open MPI has been built with ROCm support +------------------------------------------------------- + +Verify that Open MPI has been built with ROCm using the +:ref:`ompi_info(1) ` command: + +.. code-block:: sh + + # Use ompi_info to verify ROCm support in Open MPI + shell$ ./ompi_info | grep "MPI extensions" + MPI extensions: affinity, cuda, ftmpi, rocm + +///////////////////////////////////////////////////////////////////////// + +Runtime querying of ROCm support in Open MPI +-------------------------------------------- + +Querying the availability of ROCm support in Open MPI at runtime is +possible through the memory allocation kind info object; see the :ref:`memkind` +page for details. + +In addition, starting with Open MPI v5.0.0 :ref:`MPIX_Query_rocm_support(3) +` is available as an extension to check +the availability of ROCm support in the library. To use the +function, the code needs to include ``mpi-ext.h``. Note that +``mpi-ext.h`` is an Open MPI specific header file. + + +.. _sm-rocm-options-label: + +///////////////////////////////////////////////////////////////////////// + +Running single node jobs with ROCm support +------------------------------------------ + +The user has multiple options for running an Open MPI job with GPU support +in a single node scenario: + +* the default shared memory component ``btl/sm`` has support for + accelerators, but will by default use a bounce buffer on the CPU + for data transfers. Hence, while this works, it will not be able to + take advantage of the high-speed GPU-to-GPU Infinity Fabric + interconnect (if available). + +* to use the high-speed GPU-to-GPU interconnect within a node, the user has to + enable the accelerator single-copy component (``smsc/accelerator``), e.g.: + +.. code-block:: sh + + # Enable the smsc/accelerator component + shell$ mpirun --mca smsc_accelerator_priority 80 -n 64 ./<executable> + +* Alternatively, the user can replace the default shared memory + component ``btl/sm`` with the ``btl/smcuda`` component, which has + been extended to support ROCm devices. While this approach supports + communication over a high-speed GPU-to-GPU interconnect, it does not + support single-copy data transfers for host-memory through + e.g. ``xpmem`` or ``cma``. Hence, the performance of host-memory + based data transfers might be lower than with the default ``btl/sm`` + component. Example: + +.. code-block:: sh + + # Use btl/smcuda instead of btl/sm for communication + shell$ mpirun --mca btl smcuda,tcp,self -n 64 ./<executable> + +///////////////////////////////////////////////////////////////////////// + +ROCm support in Open MPI with UCX +--------------------------------- + +In this configuration, UCX will provide the ROCm support, and hence it +is important to ensure that UCX itself is built with ROCm support. Both +inter- and intra-node communication will be executed through UCX. + +To see if your UCX library was built with ROCm support, run the +following command: + +.. code-block:: sh + + # Check if ucx was built with ROCm support + shell$ ucx_info -v + + # configured with: --with-rocm=/opt/rocm --enable-mt + +If you need to build the UCX library yourself to include ROCm support, +please see the UCX documentation for `building UCX with Open MPI: +`_ + +It should look something like: + +..
code-block:: sh + + # Configure UCX with ROCm support + shell$ cd ucx + shell$ ./configure --prefix=/path/to/ucx-rocm-install \ + --with-rocm=/opt/rocm + + # Configure Open MPI with UCX and ROCm support + shell$ cd ompi + shell$ ./configure --with-rocm=/opt/rocm \ + --with-ucx=/path/to/ucx-rocm-install \ + <other configure params> + +///////////////////////////////////////////////////////////////////////// + +Using ROCm-aware UCX with Open MPI +---------------------------------- + +If UCX and Open MPI have been configured with ROCm support, specifying +the UCX pml component is sufficient to take advantage of the ROCm +support in the libraries. For example, the command to execute the +``osu_latency`` benchmark from the `OSU benchmarks +`_ with ROCm buffers +using Open MPI and UCX ROCm support is something like this: + +.. code-block:: sh + + shell$ mpirun -n 2 --mca pml ucx \ + ./osu_latency D D + +.. note:: Some additional configure flags are required to compile the + OSU benchmark to support ROCm buffers. Please refer to the + `UCX ROCm instructions + `_ + for details. + +///////////////////////////////////////////////////////////////////////// + +ROCm support in Open MPI with libfabric +--------------------------------------- + +Some network interconnects are supported through the libfabric library. +Configuring libfabric and Open MPI with ROCm support looks something like: + +.. code-block:: sh + + # Configure libfabric with ROCm support + shell$ cd libfabric + shell$ ./configure --prefix=/path/to/ofi-rocm-install \ + --with-rocr=/opt/rocm + + # Configure Open MPI with libfabric and ROCm support + shell$ cd ompi + shell$ ./configure --with-rocm=/opt/rocm \ + --with-ofi=/path/to/ofi-rocm-install \ + <other configure params> + +///////////////////////////////////////////////////////////////////////// + + +Using ROCm-aware libfabric with Open MPI +---------------------------------------- + +There are two mechanisms for using libfabric and Open MPI with ROCm support. + +* Specifying the ``mtl/ofi`` component is sufficient to take advantage + of the ROCm support in the libraries. In this case, both intra- and + inter-node communication will be performed by the libfabric library. In + order to ensure that the application will make use of the shared + memory provider for intra-node communication and the network + interconnect specific provider for inter-node communication, the + user might have to request using the ``linkX`` provider, e.g.: + +.. code-block:: sh + + # Force using the ofi mtl component + mpirun --mca pml cm --mca mtl ofi \ + --mca opal_common_ofi_provider_include "shm+cxi:lnx" \ + -n 64 ./<executable> + +* Alternatively, the user can use the ``btl/ofi`` component, in which + case the intra-node communication will use the Open MPI shared + memory mechanisms (see :ref:`sm-rocm-options-label`), and use + libfabric only for inter-node scenarios. + +.. code-block:: sh + + # Use the ofi btl for inter-node and sm btl + # for intra-node communication + mpirun --mca pml ob1 --mca btl ofi,sm,tcp,self \ + --mca smsc_accelerator_priority 80 \ + -n 64 ./<executable> + + +///////////////////////////////////////////////////////////////////////// + +Collective component supporting ROCm device memory +-------------------------------------------------- + + +The ``coll/accelerator`` component supports many commonly used +collective operations on ROCm device buffers.
The component works by copying data into a temporary host buffer, executing the collective operation on the host buffer, and copying the result back to the device buffer at completion. This component will lead to adequate performance for short to medium data sizes, but performance is often suboptimal, especially for large reduction operations. + +The `UCC `_ based collective component +in Open MPI can be configured and compiled to include ROCm support, +and will typically lead to significantly better performance for large +reductions. + +An example for configuring UCC and Open MPI with ROCm is shown below: + +.. code-block:: sh + + # Configure and compile UCC with ROCm support + shell$ cd ucc + shell$ ./configure --with-rocm=/opt/rocm \ + --with-ucx=/path/to/ucx-rocm-install \ + --prefix=/path/to/ucc-rocm-install + shell$ make -j && make install + + # Configure and compile Open MPI with UCX, UCC, and ROCm support + shell$ cd ompi + shell$ ./configure --with-rocm=/opt/rocm \ + --with-ucx=/path/to/ucx-rocm-install \ + --with-ucc=/path/to/ucc-rocm-install + +Using the UCC component in an application requires setting some +additional parameters: + +.. code-block:: + + shell$ mpirun --mca pml ucx --mca osc ucx \ + --mca coll_ucc_enable 1 \ + --mca coll_ucc_priority 100 -np 64 ./my_mpi_app + +.. note:: Using the UCC library for collective operations in Open MPI + requires using the UCX library, and hence cannot be deployed + when using, e.g., libfabric. diff --git a/docs/tuning-apps/coll-tuned.rst b/docs/tuning-apps/coll-tuned.rst index 1d5549256d8..b71f4d694ef 100644 --- a/docs/tuning-apps/coll-tuned.rst +++ b/docs/tuning-apps/coll-tuned.rst @@ -3,7 +3,7 @@ Tuning Collectives Open MPI's ``coll`` framework provides a number of components implementing collective communication, including: ``han``, ``libnbc``, ``self``, ``ucc`` ``base``, -``sync``, ``xhc``, ``accelerator``, ``basic``, ``ftagree``, ``inter``, ``portals4``, +``sync``, ``xhc``, ``accelerator``, ``basic``, ``ftagree``, ``inter``, ``portals4``, ``acoll``, and ``tuned``. Some of these components may not be available depending on how Open MPI was compiled and what hardware is available on the system. A run-time decision based on each component's self reported priority, selects which diff --git a/docs/tuning-apps/index.rst b/docs/tuning-apps/index.rst index debc86a0e5e..4d77f176e52 100644 --- a/docs/tuning-apps/index.rst +++ b/docs/tuning-apps/index.rst @@ -9,6 +9,7 @@ components that can be tuned to affect behavior at run time. environment-var networking/index + accelerators/index multithreaded dynamic-loading fork-system-popen diff --git a/docs/tuning-apps/mpi-io.rst b/docs/tuning-apps/mpi-io.rst index ddb84d62874..d478536458c 100644 --- a/docs/tuning-apps/mpi-io.rst +++ b/docs/tuning-apps/mpi-io.rst @@ -1,5 +1,5 @@ -Open MPI IO ("OMPIO") -===================== +MPI IO +====== OMPIO is an Open MPI-native implementation of the MPI I/O functions defined in the MPI specification. @@ -23,7 +23,7 @@ OMPIO is fundamentally a component of the ``io`` framework in Open MPI.
Upon opening a file, the OMPIO component initializes a number of sub-frameworks and their components, namely: -* ``fs``: responsible for all file management operations +* ``fs``: responsible for all file management operations * ``fbtl``: support for blocking and non-blocking individual I/O operations * ``fcoll``: support for blocking and non-blocking collective I/O @@ -70,8 +70,7 @@ mechanism available in Open MPI to influence a parameter value, e.g.: shell$ mpirun --mca fcoll dynamic -n 64 ./a.out ``fs`` and ``fbtl`` components are typically chosen based on the file -system type utilized (e.g. the ``pvfs2`` component is chosen when the -file is located on an PVFS2/OrangeFS file system, the ``lustre`` +system type utilized (e.g. the ``lustre`` component is chosen for Lustre file systems, etc.). The ``ufs`` ``fs`` component is used if no file system specific component is available (e.g. local file systems, NFS, BeeGFS, etc.), and the ``posix`` @@ -154,21 +153,11 @@ operation are listed below: Setting stripe size and stripe width on parallel file systems ------------------------------------------------------------- -Many ``fs`` components allow you to manipulate the layout of a new +Some ``fs`` components allow you to manipulate the layout of a new file on a parallel file system. Note that many file systems only allow changing these settings upon file creation, i.e. modifying these values for an already existing file might not be possible. -#. ``fs_pvfs2_stripe_size``: Sets the number of storage servers for a - new file on a PVFS2/OrangeFS file system. If not set, system default will be - used. Note that this parameter can also be set through the - ``stripe_size`` MPI Info value. - -#. ``fs_pvfs2_stripe_width``: Sets the size of an individual block for - a new file on a PVFS2 file system. If not set, system default will - be used. Note that this parameter can also be set through the - ``stripe_width`` MPI Info value. - #. ``fs_lustre_stripe_size``: Sets the number of storage servers for a new file on a Lustre file system. If not set, system default will be used. Note that this parameter can also be set through the @@ -193,6 +182,12 @@ significant influence on the performance of the file I/O operation from device buffers, and can be controlled using the ``io_ompio_pipeline_buffer_size`` MCA parameter. +Furthermore, some collective file I/O components such as +``fcoll/vulcan`` allow the user to influence whether the buffer used +for collective aggregation is located in host or device memory through +the ``io_ompio_use_accelerator_buffers`` MCA parameter. + + .. _label-ompio-individual-sharedfp: Using the ``individual`` ``sharedfp`` component and its limitations diff --git a/docs/tuning-apps/networking/index.rst b/docs/tuning-apps/networking/index.rst index 00aa0f39df5..2be844cb61a 100644 --- a/docs/tuning-apps/networking/index.rst +++ b/docs/tuning-apps/networking/index.rst @@ -24,5 +24,3 @@ build support for that library). shared-memory ib-and-roce iwarp - cuda - rocm diff --git a/docs/tuning-apps/networking/rocm.rst b/docs/tuning-apps/networking/rocm.rst deleted file mode 100644 index 10ee12fe9e2..00000000000 --- a/docs/tuning-apps/networking/rocm.rst +++ /dev/null @@ -1,134 +0,0 @@ -ROCm -==== - -ROCm is the name of the software stack used by AMD GPUs. It includes -the ROCm Runtime (ROCr), the HIP programming model, and numerous -numerical and machine learning libraries tuned for the AMD Instinct -accelerators.
More information can be found at the following -`AMD webpages `_ - - -Building Open MPI with ROCm support ------------------------------------ - -ROCm-aware support means that the MPI library can send and receive -data from AMD GPU device buffers directly. As of today, ROCm support -is available through UCX. While other communication transports might -work as well, UCX is the only transport formally supported in Open MPI -|ompi_ver| for ROCm devices. - -Since UCX will be providing the ROCm support, it is important to -ensure that UCX itself is built with ROCm support. - -To see if your UCX library was built with ROCm support, run the -following command: - -.. code-block:: sh - - # Check if ucx was built with ROCm support - shell$ ucx_info -v - - # configured with: --with-rocm=/opt/rocm --without-knem --without-cuda - -If you need to build the UCX library yourself to include ROCm support, -please see the UCX documentation for `building UCX with Open MPI: -`_ - -It should look something like: - -.. code-block:: sh - - # Configure UCX with ROCm support - shell$ cd ucx - shell$ ./configure --prefix=/path/to/ucx-rocm-install \ - --with-rocm=/opt/rocm --without-knem - - # Configure Open MPI with UCX and ROCm support - shell$ cd ompi - shell$ ./configure --with-rocm=/opt/rocm \ - --with-ucx=/path/to/ucx-rocm-install \ - - -///////////////////////////////////////////////////////////////////////// - -Checking that Open MPI has been built with ROCm support -------------------------------------------------------- - -Verify that Open MPI has been built with ROCm using the -:ref:`ompi_info(1) ` command: - -.. code-block:: sh - - # Use ompi_info to verify ROCm support in Open MPI - shell$ ./ompi_info | grep "MPI extensions" - MPI extensions: affinity, cuda, ftmpi, rocm - -///////////////////////////////////////////////////////////////////////// - - -Using ROCm-aware UCX with Open MPI --------------------------------------------------------------------------- - -If UCX and Open MPI have been configured with ROCm support, specifying -the UCX pml component is sufficient to take advantage of the ROCm -support in the libraries. For example, the command to execute the -``osu_latency`` benchmark from the `OSU benchmarks -`_ with ROCm buffers -using Open MPI and UCX ROCm support is something like this: - -.. code-block:: - - shell$ mpirun -n 2 --mca pml ucx \ - ./osu_latency D D - -Note: some additional configure flags are required to compile the OSU -benchmark to support ROCm buffers. Please refer to the `UCX ROCm -instructions -`_ -for details. - - -///////////////////////////////////////////////////////////////////////// - -Runtime querying of ROCm support in Open MPI --------------------------------------------- - -Starting with Open MPI v5.0.0 :ref:`MPIX_Query_rocm_support(3) -` is available as an extension to check -the availability of ROCm support in the library. To use the -function, the code needs to include ``mpi-ext.h``. Note that -``mpi-ext.h`` is an Open MPI specific header file. - -///////////////////////////////////////////////////////////////////////// - -Collective component supporting ROCm device memory --------------------------------------------------- - -The `UCC `_ based collective component -in Open MPI can be configured and compiled to include ROCm support. - -An example for configure UCC and Open MPI with ROCm is shown below: - -.. 
code-block:: - - # Configure and compile UCC with ROCm support - shell$ cd ucc - shell$ ./configure --with-rocm=/opt/rocm \ - --with-ucx=/path/to/ucx-rocm-install \ - --prefix=/path/to/ucc-rocm-install - shell$ make -j && make install - - # Configure and compile Open MPI with UCX, UCC, and ROCm support - shell$ cd ompi - shell$ ./configure --with-rocm=/opt/rocm \ - --with-ucx=/path/to/ucx-rocm-install \ - --with-ucc=/path/to/ucc-rocm-install - -To use the UCC component in an applicatin requires setting some -additional parameters: - -.. code-block:: - - shell$ mpirun --mca pml ucx --mca osc ucx \ - --mca coll_ucc_enable 1 \ - --mca coll_ucc_priority 100 -np 64 ./my_mpi_app diff --git a/docs/tuning-apps/networking/shared-memory.rst b/docs/tuning-apps/networking/shared-memory.rst index 7c40693cd76..0584c554e4f 100644 --- a/docs/tuning-apps/networking/shared-memory.rst +++ b/docs/tuning-apps/networking/shared-memory.rst @@ -13,7 +13,7 @@ can only be used between processes executing on the same node. BTL was named ``vader``. As of Open MPI version 5.0.0, the BTL has been renamed ``sm``. -.. warning:: In Open MPI version 5.0.x, the name ``vader`` is simply +.. warning:: In Open MPI version 6.0.x, the name ``vader`` is simply an alias for the ``sm`` BTL. Similarly, all ``vader_``-prefixed MCA parameters are automatically aliased to their corresponding ``sm_``-prefixed MCA @@ -90,7 +90,7 @@ The ``sm`` BTL supports two modes of shared memory communication: #. **Single copy:** In this mode, the sender or receiver makes a single copy of the message data from the source buffer in one process to the destination buffer in another process. Open MPI - supports three flavors of shared memory single-copy transfers: + supports four flavors of shared memory single-copy transfers: * `Linux KNEM `_. This is a standalone Linux kernel module, made specifically for HPC and MPI @@ -118,6 +118,18 @@ The ``sm`` BTL supports two modes of shared memory communication: Open MPI must be built on a Linux system with a recent enough Glibc and kernel version in order to build support for Linux CMA. + * Accelerator IPC mechanism: some accelerator devices support + direct GPU-to-GPU data transfers that can take advantage of + high-speed interconnects between the accelerators. This component + is based on IPC abstractions introduced in the accelerator + framework, which allow the ``sm`` BTL component to use this + mechanism if requested by the user. For host memory, this + component passes the operation through to another single-copy + component. + + The component is disabled by default. To use it, the user has + to increase its priority. + Which mechanism is used at run time depends both on how Open MPI was built and how your system is configured.
You can check to see which single-copy mechanisms Open MPI was built with via two mechanisms: From bbce4b57b5140b50095e72a1961563f727f1d4eb Mon Sep 17 00:00:00 2001 From: Roie Danino Date: Sun, 14 Dec 2025 11:03:21 +0200 Subject: [PATCH 43/51] OSHMEM/SHMEM/C: shmem_signal_fetch implementation Signed-off-by: Roie Danino --- oshmem/shmem/c/shmem_put_signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/oshmem/shmem/c/shmem_put_signal.c b/oshmem/shmem/c/shmem_put_signal.c index 4c50dc10ede..42b2840fddd 100644 --- a/oshmem/shmem/c/shmem_put_signal.c +++ b/oshmem/shmem/c/shmem_put_signal.c @@ -225,6 +225,6 @@ SHMEM_TYPE_PUTMEM_SIGNAL(_put128_signal, 16, shmem) uint64_t shmem_signal_fetch(const uint64_t *sig_addr) { - return OSHMEM_ERR_NOT_IMPLEMENTED; + return shmem_uint64_atomic_fetch(sig_addr, pshmem_my_pe()); } From a0593adfd520eb69347ac015e56f3f865ef643e7 Mon Sep 17 00:00:00 2001 From: Edgar Gabriel Date: Mon, 15 Dec 2025 15:21:35 -0600 Subject: [PATCH 44/51] sharedfp/lockedfile: do not limit filename length The sharedfp/lockedfile component had a temp. buffer of size 256 characters for generating a temp. file for testing the file locking feature during file open. For very long filenames, that was not sufficient. Switch to an asprintf based method instead of a fixed namelen to resolve the issue. Fixes issue #13565 Signed-off-by: Edgar Gabriel --- ompi/mca/sharedfp/lockedfile/sharedfp_lockedfile.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ompi/mca/sharedfp/lockedfile/sharedfp_lockedfile.c b/ompi/mca/sharedfp/lockedfile/sharedfp_lockedfile.c index bfbc940ae11..3b6e9f38682 100644 --- a/ompi/mca/sharedfp/lockedfile/sharedfp_lockedfile.c +++ b/ompi/mca/sharedfp/lockedfile/sharedfp_lockedfile.c @@ -79,8 +79,7 @@ int mca_sharedfp_lockedfile_component_init_query(bool enable_progress_threads, struct mca_sharedfp_base_module_2_0_0_t * mca_sharedfp_lockedfile_component_file_query(ompio_file_t *fh, int *priority) { struct flock lock; int fd, err; - /*char *filename;*/ - char filename[256]; + char *filename; int rank; bool has_file_lock_support=false; @@ -117,7 +116,7 @@ struct mca_sharedfp_base_module_2_0_0_t * mca_sharedfp_lockedfile_component_file /* Set the filename. */ /*data filename created by appending .locktest.$rank to the original filename*/ - snprintf(filename, sizeof(filename), "%s%s%d",fh->f_filename,".locktest.",rank); + opal_asprintf(&filename, "%s.locktest.%d",fh->f_filename, rank); lock.l_type = F_WRLCK; lock.l_start = 0; @@ -160,6 +159,7 @@ struct mca_sharedfp_base_module_2_0_0_t * mca_sharedfp_lockedfile_component_file close(fd); unlink( filename ); } + free(filename); /**priority=100;*/ if(has_file_lock_support){ return &lockedfile; From d53583605b68574f606622405f20d5d9de4d61ed Mon Sep 17 00:00:00 2001 From: Howard Pritchard Date: Tue, 16 Dec 2025 11:19:28 -0700 Subject: [PATCH 45/51] docs: cleanup some hpe/crayisms that were scattered around in the docs. Note that there are some places of historical interest where the ALPS nomenclature was retained. Signed-off-by: Howard Pritchard --- docs/features/ulfm.rst | 4 ++-- docs/installing-open-mpi/configure-cli-options/runtime.rst | 6 +++--- docs/release-notes/platform.rst | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/features/ulfm.rst b/docs/features/ulfm.rst index 6cb0acdd006..0ec3570c927 100644 --- a/docs/features/ulfm.rst +++ b/docs/features/ulfm.rst @@ -87,7 +87,7 @@ non-blocking) use an optimized implementation on top of ``ob1``. 
- Loopback (send-to-self) - TCP - UCT (InfiniBand) -- uGNI (Cray Gemini, Aries) +- OFI/libfabric - Shared Memory (FT supported with CMA and XPMEM; KNEM is untested) - Tuned and non-blocking collective communications @@ -159,7 +159,7 @@ Running under a batch scheduler ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ULFM can operate under a job/batch scheduler, and is tested routinely -with ALPS, PBS, and Slurm. One difficulty comes from the fact that +with PBS and Slurm. One difficulty comes from the fact that many job schedulers handle failures by triggering an immediate "cleanup" of the application as soon as any process fails. In addition, failure detection subsystems integrated into PRTE are not active in direct launch diff --git a/docs/installing-open-mpi/configure-cli-options/runtime.rst b/docs/installing-open-mpi/configure-cli-options/runtime.rst index c6e825e63d7..b1f88dca1cd 100644 --- a/docs/installing-open-mpi/configure-cli-options/runtime.rst +++ b/docs/installing-open-mpi/configure-cli-options/runtime.rst @@ -17,9 +17,9 @@ can be used with ``configure``: so that executables such as ``mpicc`` and ``mpirun`` can be found without needing to type long path names. -* ``--with-alps``: - Force the building of for the Cray Alps run-time environment. If - Alps support cannot be found, configure will abort. +* ``--with-pals``: + Force the building of support for the Cray PALS run-time environment. If + PALS support cannot be found, configure will abort. * ``--with-lsf=DIR``: Specify the directory where the LSF libraries and header files are diff --git a/docs/release-notes/platform.rst b/docs/release-notes/platform.rst index f576da867c9..ec6ead13598 100644 --- a/docs/release-notes/platform.rst +++ b/docs/release-notes/platform.rst @@ -51,5 +51,5 @@ that a release of Open MPI supports. * PBS Pro, Torque * Platform LSF (tested with v9.1.1 and later) * Slurm - * Cray XE, XC, and XK + * HPE/Cray PALS * Oracle Grid Engine (OGE) 6.1, 6.2 and open source Grid Engine From 355d5ce04b86d3c357ee28da315f3bf9e064cb1a Mon Sep 17 00:00:00 2001 From: Howard Pritchard Date: Mon, 15 Dec 2025 16:04:23 -0700 Subject: [PATCH 46/51] MPI_F08: error out if Fortran FCFLAGS do not work with big count Without this patch, one can successfully configure OMPI with Fortran compiler flags that lead to compilation failures when building the mpi_f08 module. This can happen when the default integer size is the same as KIND=MPI_COUNT_KIND. Add a configure check to error out at that point with a hopefully useful error message: checking if Fortran compiler can compile interface containing MPI_COUNT_KIND arguments ... no configure: error: The mpi_f08 module cannot be compiled using the configured Fortran compiler options. Either change compiler options or use --enable-mpi-fortran=usempi Signed-off-by: Howard Pritchard --- config/ompi_fortran_check_big_count.m4 | 54 ++++++++++++++++++++++++++ configure.ac | 8 ++++ 2 files changed, 62 insertions(+) create mode 100644 config/ompi_fortran_check_big_count.m4 diff --git a/config/ompi_fortran_check_big_count.m4 b/config/ompi_fortran_check_big_count.m4 new file mode 100644 index 00000000000..99ca2dbbc11 --- /dev/null +++ b/config/ompi_fortran_check_big_count.m4 @@ -0,0 +1,54 @@ +dnl -*- shell-script -*- +dnl +dnl Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +dnl University Research and Technology +dnl Corporation. All rights reserved. +dnl Copyright (c) 2004-2005 The University of Tennessee and The University +dnl of Tennessee Research Foundation. All rights +dnl reserved.
+dnl Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +dnl University of Stuttgart. All rights reserved. +dnl Copyright (c) 2004-2005 The Regents of the University of California. +dnl All rights reserved. +dnl Copyright (c) 2010-2014 Cisco Systems, Inc. All rights reserved. +dnl Copyright (c) 2022-2025 Triad National Security, LLC. All rights +dnl reserved. +dnl $COPYRIGHT$ +dnl +dnl Additional copyrights may follow +dnl +dnl $HEADER$ +dnl + +# Check whether or not the Fortran compiler can build mpi_f08 using +# the given compiler options. This test is intended to +# trap cases where default INTEGER KIND is equivalent to MPI_COUNT_KIND. + +# OMPI_FORTRAN_CHECK_BIG_COUNT([action if found], +# [action if not found]) +# ---------------------------------------------------- +AC_DEFUN([OMPI_FORTRAN_CHECK_BIG_COUNT],[ + AS_VAR_PUSHDEF([big_count_var], [ompi_cv_big_count_var]) + + AC_CACHE_CHECK([if Fortran compiler can compile interface containing MPI_COUNT_KIND arguments], big_count_var, + [AC_LANG_PUSH([Fortran]) + AC_COMPILE_IFELSE([AC_LANG_SOURCE([[module sendit_interfaces +INTERFACE SendIt + subroutine SendIt_int(X) + INTEGER :: X + end subroutine SendIt_int + subroutine SendIt_big(x) + integer(KIND=$OMPI_MPI_COUNT_KIND)::X + end subroutine SendIt_big + end interface SendIt + end module sendit_interfaces]])], + [AS_VAR_SET(big_count_var, yes)], + [AS_VAR_SET(big_count_var, no)]) + touch conftest_foo.mod + rm -rf *.mod 2>/dev/null + AC_LANG_POP([Fortran]) + ]) + + AS_VAR_IF(big_count_var, [yes], [$1], [$2]) + AS_VAR_POPDEF([big_count_var])dnl +]) diff --git a/configure.ac b/configure.ac index 584657a6811..d5fc7325933 100644 --- a/configure.ac +++ b/configure.ac @@ -896,6 +896,14 @@ AC_CHECK_MEMBERS([struct timespec.tv_nsec], # m4_ifdef([project_ompi], [OMPI_FIND_MPI_AINT_COUNT_OFFSET]) + +# +# now that we have MPI_COUNT_KIND set, try building +# MPI F08 bindings if that level of fortran support is requested +# +m4_ifdef([project_ompi], [AS_IF([test $OMPI_BUILD_FORTRAN_BINDINGS -ge $OMPI_FORTRAN_USEMPIF08_BINDINGS], + [OMPI_FORTRAN_CHECK_BIG_COUNT([], + [AC_MSG_ERROR([The mpi_f08 module cannot be compiled using the current set of Fortran compiler options ($FCFLAGS). Either change compiler options or use --enable-mpi-fortran=usempi])]) + ])]) # checkpoint results AC_CACHE_SAVE From b9ee2d39eebedceaa03766395aa919d27ebc6fe4 Mon Sep 17 00:00:00 2001 From: Howard Pritchard Date: Tue, 16 Dec 2025 12:29:00 -0700 Subject: [PATCH 47/51] docs: generalize a couple of places for current release rather than hardwired to 5.0.(0,x). Signed-off-by: Howard Pritchard --- docs/building-apps/abi-compatibility.rst | 2 +- docs/mca.rst | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/building-apps/abi-compatibility.rst b/docs/building-apps/abi-compatibility.rst index ab9272241bd..458362ca0ef 100644 --- a/docs/building-apps/abi-compatibility.rst +++ b/docs/building-apps/abi-compatibility.rst @@ -11,7 +11,7 @@ application compiled with Open MPI v4.x can be executed with Open MPI .. important:: ABI is maintained for *most* of the Fortran MPI bindings, too |mdash| see below for additional information. There are however a few scenarios where an application compiled with -Open MPI v4.x might not execute correctly with Open MPI 5.0. +Open MPI v4.x might not execute correctly with Open MPI |ompi_series|. - Fortran compilers provide varying degrees of ABI guarantees
   As such, Open MPI can only provide ABI guarantees
diff --git a/docs/mca.rst b/docs/mca.rst
index 55063b5c29e..a30c763320a 100644
--- a/docs/mca.rst
+++ b/docs/mca.rst
@@ -661,8 +661,8 @@ presented here so that they can easily be found via internet searches:
 
 .. _label-mca-backward-compat:
 
-MCA Parameter Changes Between Open MPI 4.x and 5.x
---------------------------------------------------
+MCA Parameter Changes Between Open MPI 4.x and newer releases
+-------------------------------------------------------------
 
 When Open MPI :ref:`switched from using ORTE to PRRTE as its run-time
 environment, ` some MCA

From 7d78a4d08eb112a57f87859aa901446195344ccb Mon Sep 17 00:00:00 2001
From: Howard Pritchard
Date: Tue, 16 Dec 2025 13:02:42 -0700
Subject: [PATCH 48/51] docs: tweaks to developer docs for 6.0.x

Signed-off-by: Howard Pritchard
---
 docs/developers/gnu-autotools.rst | 7 +++++++
 docs/developers/prerequisites.rst | 2 +-
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/docs/developers/gnu-autotools.rst b/docs/developers/gnu-autotools.rst
index c6a9c96ca56..f3187a89e16 100644
--- a/docs/developers/gnu-autotools.rst
+++ b/docs/developers/gnu-autotools.rst
@@ -186,6 +186,13 @@ to at least the versions listed below.
      - 2.4.6
      - 2.5.35
      - 4.2.0
+   * - v6.0.x
+     - 1.4.17
+     - 2.69
+     - 1.15
+     - 2.4.6
+     - 2.5.35
+     - 4.2.0
    * - Git main
      - 1.4.17
      - 2.69
diff --git a/docs/developers/prerequisites.rst b/docs/developers/prerequisites.rst
index 3d3235df80b..b4971635b52 100644
--- a/docs/developers/prerequisites.rst
+++ b/docs/developers/prerequisites.rst
@@ -59,7 +59,7 @@ the Open MPI build, such as (but not limited to):
 
 * When building from a Git clone:
 
-  * Generating the Fortran bindings
+  * Generating the C and Fortran bindings
 
   * Generating the "show help" messages

From a88da05ea43c72df360319f1f1f59abb49d56106 Mon Sep 17 00:00:00 2001
From: Howard Pritchard
Date: Tue, 16 Dec 2025 13:24:04 -0700
Subject: [PATCH 49/51] docs: add note about F08 big count and FCFLAGS

Related to #13572

Signed-off-by: Howard Pritchard
---
 docs/installing-open-mpi/compilers-and-flags.rst | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/docs/installing-open-mpi/compilers-and-flags.rst b/docs/installing-open-mpi/compilers-and-flags.rst
index eaca8f1a569..7691cf1ff12 100644
--- a/docs/installing-open-mpi/compilers-and-flags.rst
+++ b/docs/installing-open-mpi/compilers-and-flags.rst
@@ -130,6 +130,14 @@ compatible, meaning that object files created by one compiler must be
 able to be linked with object files from the other compilers and
 produce correctly functioning executables.
 
+.. note:: Starting with Open MPI 6.0.0, the MPI 4.0 large-count C and
+   Fortran ``mpi_f08`` interfaces are supported. As a consequence, users
+   may specify Fortran compiler options that conflict with this support:
+   the ``mpi_f08`` interfaces declare both default ``INTEGER`` and
+   ``INTEGER(KIND=MPI_COUNT_KIND)`` specific procedures, which the
+   compiler must be able to tell apart. The Open MPI ``configure`` script
+   will detect this condition and fail with an appropriate error message.
+
 Statically linking to the libraries of Intel compiler suite
 -----------------------------------------------------------

From ee1aac8d9a0860009af4ba5ea6a0bd6b8d765ded Mon Sep 17 00:00:00 2001
From: Howard Pritchard
Date: Wed, 17 Dec 2025 22:45:19 +0000
Subject: [PATCH 50/51] ACCL/ZE: fix a missing-symbol problem and squash some
 compiler warnings the Intel oneAPI compiler was emitting

Signed-off-by: Howard Pritchard
---
 .../accelerator/ze/accelerator_ze_module.c | 24 +++++++++----------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/opal/mca/accelerator/ze/accelerator_ze_module.c b/opal/mca/accelerator/ze/accelerator_ze_module.c
index cb6cff21fef..6541444ae49 100644
--- a/opal/mca/accelerator/ze/accelerator_ze_module.c
+++ b/opal/mca/accelerator/ze/accelerator_ze_module.c
@@ -1,6 +1,6 @@
 /*
  * Copyright (c) 2022      Advanced Micro Devices, Inc. All Rights reserved.
- * Copyright (c) 2023-2024 Triad National Security, LLC. All rights reserved.
+ * Copyright (c) 2023-2025 Triad National Security, LLC. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -453,8 +453,7 @@ static int mca_accelerator_ze_memcpy_async(int dest_dev_id, int src_dev_id, void
     ze_result_t zret;
     opal_accelerator_ze_stream_t *ze_stream = NULL;
 
-    if (NULL == stream || NULL == src ||
-        NULL == dest || size < 0) {
+    if (NULL == stream || NULL == src || NULL == dest) {
         return OPAL_ERR_BAD_PARAM;
     }
     if (0 == size) {
@@ -489,7 +488,7 @@ static int mca_accelerator_ze_memcpy(int dest_dev_id, int src_dev_id, void *dest
 
     opal_accelerator_ze_stream_t *ze_stream = NULL;
 
-    if (NULL == src || NULL == dest || size <0) {
+    if (NULL == src || NULL == dest) {
         return OPAL_ERR_BAD_PARAM;
     }
     if (0 == size) {
@@ -581,6 +580,14 @@ static int mca_accelerator_ze_memmove_async(int dest_dev_id, int src_dev_id, voi
     return OPAL_ERR_NOT_IMPLEMENTED;
 }
 
+static int mca_accelerator_ze_sync_stream(opal_accelerator_stream_t *stream)
+{
+    /*
+     * TODO
+     */
+    return OPAL_ERR_NOT_IMPLEMENTED;
+}
+
 static int mca_accelerator_ze_mem_alloc(int dev_id, void **ptr, size_t size)
 {
     ze_result_t zret;
@@ -852,15 +859,6 @@ static int mca_accelerator_ze_get_buffer_id(int dev_id, const void *addr, opal_a
     return OPAL_SUCCESS;
 }
 
-
-static int mca_accelerator_ze_wait_stream(opal_accelerator_stream_t *stream)
-{
-    /*
-     * TODO
-     */
-    return OPAL_ERR_NOT_IMPLEMENTED;
-}
-
 static int mca_accelerator_ze_get_num_devices(int *num_devices)
 {
     /*

From ab57fbbe6114ab1834cc3e432d2d93451be9374b Mon Sep 17 00:00:00 2001
From: Joseph Antony
Date: Mon, 22 Dec 2025 09:46:28 -0500
Subject: [PATCH 51/51] Edits for public APIs: put_with_notify, get_with_notify

Signed-off-by: Joseph Antony
---
 ompi/mca/osc/sm/osc_sm.h | 1 -
 ompi/runtime/ompi_spc.c  | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/ompi/mca/osc/sm/osc_sm.h b/ompi/mca/osc/sm/osc_sm.h
index 200ec8b3de8..0aca3b50892 100644
--- a/ompi/mca/osc/sm/osc_sm.h
+++ b/ompi/mca/osc/sm/osc_sm.h
@@ -107,7 +107,6 @@ int ompi_osc_sm_detach(struct ompi_win_t *win, const void *base);
 
 int ompi_osc_sm_free(struct ompi_win_t *win);
 
-// TODO: add put/get_notify prototypes
 int ompi_osc_sm_put(const void *origin_addr,
                     size_t origin_count,
diff --git a/ompi/runtime/ompi_spc.c b/ompi/runtime/ompi_spc.c
index 1d25545c80b..dcbbe04b256 100644
--- a/ompi/runtime/ompi_spc.c
+++ b/ompi/runtime/ompi_spc.c
@@ -74,7 +74,7 @@ static const ompi_spc_event_t ompi_spc_events_desc[OMPI_SPC_NUM_COUNTERS] = {
     SET_COUNTER_ARRAY(OMPI_SPC_PUT_NOTIFY, "The number of times MPI_Put_notify was called.", false, false),
     SET_COUNTER_ARRAY(OMPI_SPC_RPUT, "The number of times MPI_Rput was called.", false, false),
     SET_COUNTER_ARRAY(OMPI_SPC_GET, "The number of times MPI_Get was called.", false, false),
-    SET_COUNTER_ARRAY(OMPI_SPC_GET_NOTIFY, "The number of times MPI_Get was called.", false, false),
+    SET_COUNTER_ARRAY(OMPI_SPC_GET_NOTIFY, "The number of times MPI_Get_notify was called.", false, false),
     SET_COUNTER_ARRAY(OMPI_SPC_RGET, "The number of times MPI_Rget was called.", false, false),
     SET_COUNTER_ARRAY(OMPI_SPC_PROBE, "The number of times MPI_Probe was called.", false, false),
     SET_COUNTER_ARRAY(OMPI_SPC_IPROBE, "The number of times MPI_Iprobe was called.", false, false),
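
A note on the big-count configure check introduced in patch 46: the failure
mode it guards against can be reproduced outside of configure. The Fortran
sketch below is illustrative only and is not part of any patch above; the
module and procedure names are invented, count_kind stands in for
MPI_COUNT_KIND, and gfortran's -fdefault-integer-8 is assumed as one example
of a flag that promotes default INTEGER to 8 bytes.

module bigcount_demo
  implicit none
  ! Stand-in for MPI_COUNT_KIND (typically 8 on 64-bit builds).
  integer, parameter :: count_kind = 8
  interface send_it
     ! Two specifics that differ only in the kind of their argument.
     module procedure send_it_int
     module procedure send_it_big
  end interface send_it
contains
  subroutine send_it_int(x)
    integer, intent(in) :: x                   ! kind 8 under -fdefault-integer-8
  end subroutine send_it_int
  subroutine send_it_big(x)
    integer(kind=count_kind), intent(in) :: x  ! always kind 8
  end subroutine send_it_big
end module bigcount_demo

Compiling this file with "gfortran -c" succeeds because the two specific
procedures take distinguishable integer kinds; adding -fdefault-integer-8
makes both take a kind-8 integer, the generic interface becomes ambiguous,
and the module is rejected -- the same condition that now makes configure
abort with the error message quoted in patch 46.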