admin管理员组

文章数量:1444674

NCCL API ncclGetUniqueId 功能实现、测试和结果验证

测试程序使用的nccl版本和cuda版本

NCCL 版本 : 2.19.3

CUDA版本: cuda_12.4.r12.4

ncclGetUniqueId API实现整体流程图

ncclGetUniqueId API实现流程图

ncclGetUniqueId API源代码解析

ncclGetUniqueId API源代码大致可以分为四个函数:ncclInit,PtrCheck,bootstrapGetUniqueId和hashUniqueId。

结构体 ncclUniqueId 定义

代码语言:c
/* Size, in bytes, of the raw buffer held by an ncclUniqueId. */
#define NCCL_UNIQUE_ID_BYTES 128
/* Identifier filled in by ncclGetUniqueId(); exposed to callers only as a fixed-size byte buffer. */
typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId;

ncclInit 函数解析

ncclInit函数主要完成env检查和明确目标网卡相关信息。

默认env是NULL,实际中可以赋值完成相关应用。目标网卡信息主要是网卡名和对应地址:先检查ib网卡,再检查普通网卡,使用getifaddrs系统API找到网卡名和对应地址信息。对应地址信息存放在ncclUniqueId结构体的第8-15字节(不是bit),与IP地址相关,需要借助inet_ntop等API转换,不能直接读取。

ncclNetPluginInit函数负责查找并打开libnccl-net.so;默认环境下没有该文件,函数直接返回成功。initNvtxRegisteredEnums目前理解是实现nccl规约计算相关的初始化(后续有需要再详细解析规约计算相关功能)。

PtrCheck 函数解析

check第一个参数地址是否NULL,默认运行时不为NULL,没有其他功能代码。

bootstrapGetUniqueId 函数解析

bootstrapGetUniqueId主要完成获取随机数和 NCCL 初始化阶段创建 Bootstrap 网络的根节点。

通过Linux内核提供的/dev/urandom获取随机数,该随机数保存在ncclUniqueId的第0-7字节。

bootstrapCreateRoot完成创建 Bootstrap 网络的根节点功能,后续有需要再详细解析此功能。

hashUniqueId 函数解析

对ncclUniqueId计算自定义hash,结果仅用于trace输出。

ncclGetUniqueId API运行结果

ncclGetUniqueId API运行结果

ncclGetUniqueId API 例子源代码

代码语言:cu
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <string.h>

#include <sys/socket.h>
#include <arpa/inet.h>
#include <netinet/tcp.h>
#include <netdb.h>
#include <fcntl.h>
#include <poll.h>

#include "cuda_runtime.h"
#include "nccl.h"

/* Expected argc: the program name plus exactly one argument (the device count). */
#define DEFAULT_ARGC      (2)
/* Shell command passed to system() on the usage path. */
char get_gpu_number_cmd[] = "nvidia-smi topo -m";

#if 1
/*
 * Evaluate a CUDA runtime call once; on any result other than cudaSuccess,
 * print the file, line and error string, then terminate the process.
 *
 * The argument is parenthesized and the temporary uses a macro-private name so
 * that an expression mentioning a caller variable called `err` is not captured
 * by the macro's own local.
 */
#define CUDACHECK(cmd) do {                                       \
  cudaError_t cuda_check_status_ = (cmd);                         \
  if (cuda_check_status_ != cudaSuccess) {                        \
    printf("Failed: Cuda error %s:%d '%s'\n",                     \
        __FILE__,__LINE__,cudaGetErrorString(cuda_check_status_)); \
    exit(EXIT_FAILURE);                                           \
  }                                                               \
} while(0)

/*
 * Evaluate an NCCL call once; on any result other than ncclSuccess, print the
 * file, line and error string, then terminate the process.
 *
 * Same hygiene as CUDACHECK: parenthesized argument, macro-private temporary.
 */
#define NCCLCHECK(cmd) do {                                       \
  ncclResult_t nccl_check_status_ = (cmd);                        \
  if (nccl_check_status_ != ncclSuccess) {                        \
    printf("Failed, NCCL error %s:%d '%s'\n",                     \
        __FILE__,__LINE__,ncclGetErrorString(nccl_check_status_)); \
    exit(EXIT_FAILURE);                                           \
  }                                                               \
} while(0)
#endif

/*
 * Print the first `cnt` elements of `h_A` to stdout on a single line, each
 * followed by one space, and finish the line with a newline.
 * Nothing but the newline is printed when `cnt` is zero or negative.
 */
void show_h_A_value(int *h_A, int cnt)
{
    int idx = 0;

    while (idx < cnt) {
        printf("%d ", h_A[idx]);
        idx++;
    }
    printf("\n");
}

int main(int argc, char* argv[])
{
  ncclComm_t comms[4];

  //managing 4 devices
  int nDev = 1;
  int devs[4] = { 0, 1, 2, 3 };

  printf("=====================start==============\n");
  printf("[%s +%d %s] ncclGetVersion/ncclGetUniqueId/ncclCommInitAll API test example\n", __FILE__, __LINE__, __func__);

  if(DEFAULT_ARGC != argc)
  {
    printf("[%s +%d %s] argc=%d !=%d so exit(0)\n", __FILE__, __LINE__, __func__, argc, DEFAULT_ARGC);
    printf("[%s +%d %s] example ./nccl_inits.elf num is the number can used \n", __FILE__, __LINE__, __func__);
    printf("./nccl_inits.elf 1 \n");
    printf("./nccl_inits.elf 2 \n");
    printf("./nccl_inits.elf 4 \n");
	  system(get_gpu_number_cmd);
    exit(0);
  }
  else
  {
    nDev = atoi(argv[1]);
    printf("[%s +%d %s] input nDev=%d\n", __FILE__, __LINE__, __func__, nDev);
  }

  ncclUniqueId uniqueId;
  /* Common socket address storage structure for IPv4/IPv6 */
  union ncclSocketAddress {
    struct sockaddr sa;
    struct sockaddr_in sin;
    struct sockaddr_in6 sin6;
  };
  struct ncclBootstrapHandle {
    uint64_t magic;
    union ncclSocketAddress addr;
  };
  struct ncclBootstrapHandle handle;
  uint64_t magic;
  #define MAX_BUSIDSTR_SIZE   (13)
  char busIdStr[MAX_BUSIDSTR_SIZE] = "";
  int driverVersion;
  CUDACHECK(cudaDriverGetVersion(&driverVersion));
  printf("[%s +%d %s] driverVersion=%d\n", __FILE__, __LINE__, __func__, driverVersion);

  int runtimeVersion;
  CUDACHECK(cudaRuntimeGetVersion(&runtimeVersion));
  printf("[%s +%d %s] runtimeVersion=%d\n", __FILE__, __LINE__, __func__, runtimeVersion);


  for (int i = 0; i < nDev; ++i)
  {
    CUDACHECK(cudaSetDevice(i));
    int nccl_version = 0;
    printf("[%s +%d %s] call ncclGetVersion\n", __FILE__, __LINE__, __func__);
    NCCLCHECK(ncclGetVersion(&nccl_version));
    printf("[%s +%d %s] nccl_version=%d\n", __FILE__, __LINE__, __func__, nccl_version);

    printf("[%s +%d %s] call ncclGetUniqueId\n", __FILE__, __LINE__, __func__);
    NCCLCHECK(ncclGetUniqueId(&uniqueId));
    memcpy(&handle, uniqueId.internal, sizeof(struct ncclBootstrapHandle));
    memcpy(&magic, uniqueId.internal, sizeof(uint64_t));
    printf("[%s +%d %s] [%d] memcpy(&magic, uniqueId.internal, sizeof(uint64_t))      magic=0x%lx\n", __FILE__, __LINE__, __func__, i, magic);
    memcpy(&magic, uniqueId.internal + 8, sizeof(uint64_t));
    printf("[%s +%d %s] [%d] memcpy(&magic, uniqueId.internal + 8, sizeof(uint64_t)); magic=0x%lx\n", __FILE__, __LINE__, __func__, i, magic);
    printf("[%s +%d %s] MAX_BUSIDSTR_SIZE=%d sizeof(busIdStr)=%ld\n", __FILE__, __LINE__, __func__, MAX_BUSIDSTR_SIZE, sizeof(busIdStr));
    CUDACHECK(cudaDeviceGetPCIBusId(busIdStr, sizeof(busIdStr), i));
    printf("[%s +%d %s] busIdStr=%s\n", __FILE__, __LINE__, __func__, busIdStr);

  }

  printf("[%s +%d %s] call ncclCommInitAll\n", __FILE__, __LINE__, __func__);
  //initializing NCCL
  NCCLCHECK(ncclCommInitAll(comms, nDev, devs));
  
  // get some config from comm
  int com_count = 0;
  int com_devid = 0;
  int com_rank = 0;
  for(int i = 0; i < nDev; ++i)
  {
    printf("[%s +%d %s] call ncclCommCount\n", __FILE__, __LINE__, __func__);
    NCCLCHECK(ncclCommCount(comms[i], &com_count));
    printf("[%s +%d %s] [%d] com_count=%d\n", __FILE__, __LINE__, __func__, i, com_count);
    
    printf("[%s +%d %s] call ncclCommCuDevice\n", __FILE__, __LINE__, __func__);
    NCCLCHECK(ncclCommCuDevice(comms[i], &com_devid));
    printf("[%s +%d %s] [%d] com_devid=%d\n", __FILE__, __LINE__, __func__, i, com_devid);
    
    printf("[%s +%d %s] call ncclCommUserRank\n", __FILE__, __LINE__, __func__);
    NCCLCHECK(ncclCommUserRank(comms[i], &com_rank));
    printf("[%s +%d %s] [%d] com_rank=%d\n", __FILE__, __LINE__, __func__, i, com_rank);
  }
  //finalizing NCCL
  for(int i = 0; i < nDev; ++i)
  {
    ncclCommDestroy(comms[i]);
  }

  printf("=====================end==============\n");
  return 0;
}

getRandomData 函数例子源代码和运行结果

ncclGetUniqueId API 例子源代码和运行结果

参考链接

本文标签: NCCL API ncclGetUniqueId 功能实现、测试和结果验证