3.1. NCCL Example¶
Here’s an example created based on the NVIDIA Docs.
// helper.hh
#include <iostream>
#include <mpi.h>
#include <assert.h>
#include <memory>
#include "cuda_runtime.h"
#include "nccl.h"
#include <unistd.h>
#include <stdint.h>
#define MPICHECK(cmd) do { \
int e = cmd; \
if( e != MPI_SUCCESS ) { \
printf("Failed: MPI error %s:%d '%d'\n", \
__FILE__,__LINE__, e); \
exit(EXIT_FAILURE); \
} \
} while(0)
#define CUDACHECK(cmd) do { \
cudaError_t e = cmd; \
if( e != cudaSuccess ) { \
printf("Failed: Cuda error %s:%d '%s'\n", \
__FILE__,__LINE__,cudaGetErrorString(e)); \
exit(EXIT_FAILURE); \
} \
} while(0)
#define NCCLCHECK(cmd) do { \
ncclResult_t r = cmd; \
if (r!= ncclSuccess) { \
printf("Failed, NCCL error %s:%d '%s'\n", \
__FILE__,__LINE__,ncclGetErrorString(r)); \
exit(EXIT_FAILURE); \
} \
} while(0)
struct MPIH {
int ranks, rank;
MPI_Comm comm;
MPIH(int *argc, char **argv[]) : comm(MPI_COMM_WORLD) {
int provided;
MPICHECK( MPI_Init_thread(argc, argv, MPI_THREAD_FUNNELED, &provided) );
assert(provided >= MPI_THREAD_FUNNELED);
MPICHECK( MPI_Comm_size( comm, &ranks) );
MPICHECK( MPI_Comm_rank( comm, &rank ) );
}
~MPIH() {
MPI_Finalize();
}
};
using MPIp = std::shared_ptr<MPIH>;
struct NCCLH {
MPIp mpi;
ncclUniqueId id;
ncclComm_t comm;
cudaStream_t stream;
NCCLH(MPIp _mpi) : mpi(_mpi) {
int numGPUs;
if (mpi->rank == 0) ncclGetUniqueId(&id);
MPICHECK( MPI_Bcast((void *)&id, sizeof(id), MPI_BYTE, 0, mpi->comm) );
CUDACHECK( cudaGetDeviceCount(&numGPUs) );
CUDACHECK( cudaSetDevice(mpi->rank % numGPUs) );
CUDACHECK( cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking) );
NCCLCHECK( ncclCommInitRank(&comm, mpi->ranks, id, mpi->rank) );
}
~NCCLH() {
CUDACHECK(cudaStreamDestroy(stream));
ncclCommDestroy(comm);
}
};
using NCCLp = std::shared_ptr<NCCLH>;
It’s always a good idea to wrap up initialization and finalization code inside classes to manage them.
Note that we’re using shared-pointers here, since lots of different
parts of the code might end up storing the NCCL-helper struct above.
Shared pointers act just like pointers (de-reference with *
or ->
).
However, they track reference-counts and destroy the object when it hits zero.
With this out of the way, the main code is simple:
// allreduce.cu
#include "helper.hh"
int size = 8*1024*1024; // 64 MB
int run(NCCLp nccl, const float *send, float *recv) {
NCCLCHECK(ncclAllReduce((const void*)send, (void*)recv, size, ncclFloat, ncclSum,
nccl->comm, nccl->stream));
return 0;
}
int main(int argc, char *argv[]) {
auto mpi = std::make_shared<MPIH>(&argc, &argv);
auto nccl = std::make_shared<NCCLH>(mpi);
float *sendbuff, *recvbuff;
CUDACHECK(cudaMalloc(&sendbuff, size * sizeof(float)));
CUDACHECK(cudaMalloc(&recvbuff, size * sizeof(float)));
std::cout << "Hello" << std::endl;
run(nccl, sendbuff, recvbuff);
CUDACHECK(cudaStreamSynchronize(nccl->stream));
return 0;
}
Here’s the cmake magic needed to compile it,
# CMakeLists.txt
CMAKE_MINIMUM_REQUIRED(VERSION 3.17)
PROJECT(use_nccl CXX CUDA)
# Dependency Packages
list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake")
find_package(MPI REQUIRED)
find_package(NCCL REQUIRED)
# Global project properties
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED True)
add_executable(allreduce allreduce.cu)
target_link_libraries(allreduce PUBLIC NCCL MPI::MPI_CXX)
set_property(TARGET allreduce PROPERTY CUDA_ARCHITECTURES 70)
install(TARGETS allreduce DESTINATION bin)
and the biggest file in the distribution,
# Find the nccl libraries
# from https://github.com/xuhdev/pytorch/blob/a3b4accf014e18bf84f58d3018854435cbc3d55b/cmake/Modules/FindNCCL.cmake
#
# The following variables are optionally searched for defaults
# NCCL_ROOT: Base directory where all NCCL components are found
# NCCL_INCLUDE_DIR: Directory where NCCL header is found
# NCCL_LIB_DIR: Directory where NCCL library is found
#
# The following are set after configuration is done:
# NCCL_FOUND
# NCCL_INCLUDE_DIRS
# NCCL_LIBRARIES
#
# The path hints include CUDA_TOOLKIT_ROOT_DIR seeing as some folks
# install NCCL in the same location as the CUDA toolkit.
# See https://github.com/caffe2/caffe2/issues/1601
set(NCCL_INCLUDE_DIR $ENV{NCCL_INCLUDE_DIR} CACHE PATH "Folder contains NVIDIA NCCL headers")
set(NCCL_LIB_DIR $ENV{NCCL_LIB_DIR} CACHE PATH "Folder contains NVIDIA NCCL libraries")
set(NCCL_VERSION $ENV{NCCL_VERSION} CACHE STRING "Version of NCCL to build with")
list(APPEND NCCL_ROOT ${NCCL_ROOT_DIR} ${CUDA_TOOLKIT_ROOT_DIR})
# Compatible layer for CMake <3.12. NCCL_ROOT will be accounted in for searching paths and libraries for CMake >=3.12.
list(APPEND CMAKE_PREFIX_PATH ${NCCL_ROOT})
find_path(NCCL_INCLUDE_DIRS
NAMES nccl.h
HINTS ${NCCL_INCLUDE_DIR})
if (USE_STATIC_NCCL)
MESSAGE(STATUS "USE_STATIC_NCCL is set. Linking with static NCCL library.")
SET(NCCL_LIBNAME "nccl_static")
if (NCCL_VERSION) # Prefer the versioned library if a specific NCCL version is specified
set(CMAKE_FIND_LIBRARY_SUFFIXES ".a.${NCCL_VERSION}" ${CMAKE_FIND_LIBRARY_SUFFIXES})
endif()
else()
SET(NCCL_LIBNAME "nccl")
if (NCCL_VERSION) # Prefer the versioned library if a specific NCCL version is specified
set(CMAKE_FIND_LIBRARY_SUFFIXES ".so.${NCCL_VERSION}" ${CMAKE_FIND_LIBRARY_SUFFIXES})
endif()
endif()
find_library(NCCL_LIBRARIES
NAMES ${NCCL_LIBNAME}
HINTS ${NCCL_LIB_DIR})
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(NCCL DEFAULT_MSG NCCL_INCLUDE_DIRS NCCL_LIBRARIES)
if(NCCL_FOUND)
set (NCCL_HEADER_FILE "${NCCL_INCLUDE_DIRS}/nccl.h")
message (STATUS "Determining NCCL version from the header file: ${NCCL_HEADER_FILE}")
file (STRINGS ${NCCL_HEADER_FILE} NCCL_MAJOR_VERSION_DEFINED
REGEX "^[ \t]*#define[ \t]+NCCL_MAJOR[ \t]+[0-9]+.*$" LIMIT_COUNT 1)
if (NCCL_MAJOR_VERSION_DEFINED)
string (REGEX REPLACE "^[ \t]*#define[ \t]+NCCL_MAJOR[ \t]+" ""
NCCL_MAJOR_VERSION ${NCCL_MAJOR_VERSION_DEFINED})
message (STATUS "NCCL_MAJOR_VERSION: ${NCCL_MAJOR_VERSION}")
endif ()
message(STATUS "Found NCCL (include: ${NCCL_INCLUDE_DIRS}, library: ${NCCL_LIBRARIES})")
# Create a new-style imported target (NCCL)
if (USE_STATIC_NCCL)
add_library(NCCL STATIC IMPORTED)
else()
add_library(NCCL SHARED IMPORTED)
endif ()
set_property(TARGET NCCL PROPERTY
IMPORTED_LOCATION ${NCCL_LIBRARIES})
set_property(TARGET NCCL PROPERTY
LANGUAGE CUDA)
target_include_directories(NCCL INTERFACE ${NCCL_INCLUDE_DIRS})
mark_as_advanced(NCCL_ROOT_DIR NCCL_INCLUDE_DIRS NCCL_LIBRARIES)
endif()
I built NCCL from their github source
(using make src.build CUDA_HOME=$CUDA_DIR NVCC_GENCODE="-gencode=arch=compute_70,code=sm_70"
)
and left it in its build directory (nccl/build). Then ran
cmake -DCMAKE_PREFIX_PATH=/path/to/nccl/build ..
. You’ll need
cuda and MPI modules loaded, and MPI build flags enabled.
You can run some quick tests on this using interactive mode,
bsub -nnodes 1 -W 30 -P CHM101 -Is $SHELL
# run 6 ranks per node
jsrun --smpiargs=-gpu -r 6 -g 1 -c 7 -b packed:7 -EOMP_NUM_THREADS=7 ./allreduce
The performance plots below were gathered following the recipe for Making a Scaling Plot.
The “bus” bandwidths reported are computed using the simple formula 2x(bytes per rank in allreduce) / time. This makes them smaller than the true interconnect bandwidth. Times collected were averages over 10 allreduces with no blocking in-between. The first 10 are marked as “cold” and the second 10 are marked as “warm”. NCCL’s first run has a noticeable, but worthwhile initialization cost.
Also, the performance was insensitive to the layout of resource sets:
jsrun --smpiargs=-gpu -n $((6*nodes)) -g1 -c7 -b packed:7
jsrun --smpiargs=-gpu -n $((2*nodes)) -g3 -c21 -a3 -b packed:7
jsrun --smpiargs=-gpu -n $nodes -g6 -c42 -a6 -b packed:7
Contributed by
David M. Rogers