MPI Code for Ghost Data Exchange in 3D Domain Decomposition with Multi-GPUs

Here’s a comprehensive MPI/CUDA example that demonstrates ghost (halo) data exchange for a 3D domain decomposition across multiple GPUs. The implementation assumes a CUDA-aware MPI library, so device pointers can be passed directly to MPI calls for efficient GPU-to-GPU communication.

#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
#include <cuda_runtime.h>

// Dimensions of the global domain
#define NX_GLOBAL 256
#define NY_GLOBAL 256
#define NZ_GLOBAL 256

// Ghost layer width
#define GHOST_WIDTH 2

// Macro for checking CUDA errors
#define CUDA_CHECK(cmd) {                                                   \
    cudaError_t error = cmd;                                                \
    if (error != cudaSuccess) {                                             \
        printf("<%s>:%i ", __FILE__, __LINE__);                             \
        printf("[CUDA] Error: %s\n", cudaGetErrorString(error));            \
        exit(EXIT_FAILURE);                                                 \
    }                                                                       \
}

// Structure to hold domain decomposition information
typedef struct {
    int rank;                       // MPI rank
    int size;                       // Number of MPI processes
    int dims[3];                    // Process grid dimensions
    int coords[3];                  // Process coordinates in grid
    MPI_Comm cart_comm;             // Cartesian communicator

    int nx, ny, nz;                 // Local domain sizes (excluding ghost layers)
    int start_x, start_y, start_z;  // Starting indices in the global domain

    double *d_data;                 // Main data array on device (with ghost layers)
    double *d_send_buffers[6];      // Send buffers, one per face
    double *d_recv_buffers[6];      // Receive buffers, one per face
} DomainInfo;

// Initialize domain decomposition
void init_domain_decomposition(DomainInfo *domain, int argc, char **argv) {
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &domain->rank);
    MPI_Comm_size(MPI_COMM_WORLD, &domain->size);

    // Create a 3D Cartesian topology
    int periods[3] = {1, 1, 1}; // Periodic boundaries
    domain->dims[0] = domain->dims[1] = domain->dims[2] = 0;
    MPI_Dims_create(domain->size, 3, domain->dims);
    MPI_Cart_create(MPI_COMM_WORLD, 3, domain->dims, periods, 1, &domain->cart_comm);
    MPI_Cart_coords(domain->cart_comm, domain->rank, 3, domain->coords);

    // Local domain sizes (excluding ghost layers)
    domain->nx = NX_GLOBAL / domain->dims[0];
    domain->ny = NY_GLOBAL / domain->dims[1];
    domain->nz = NZ_GLOBAL / domain->dims[2];

    // Starting indices in the global domain
    domain->start_x = domain->coords[0] * domain->nx;
    domain->start_y = domain->coords[1] * domain->ny;
    domain->start_z = domain->coords[2] * domain->nz;

    // Adjust for non-divisible sizes: the last process in each direction takes the remainder
    if (domain->coords[0] == domain->dims[0] - 1) domain->nx = NX_GLOBAL - domain->start_x;
    if (domain->coords[1] == domain->dims[1] - 1) domain->ny = NY_GLOBAL - domain->start_y;
    if (domain->coords[2] == domain->dims[2] - 1) domain->nz = NZ_GLOBAL - domain->start_z;

    // Allocate device memory for the main data array (including ghost layers)
    size_t data_size = (size_t)(domain->nx + 2*GHOST_WIDTH) *
                       (domain->ny + 2*GHOST_WIDTH) *
                       (domain->nz + 2*GHOST_WIDTH) * sizeof(double);
    CUDA_CHECK(cudaMalloc(&domain->d_data, data_size));
    CUDA_CHECK(cudaMemset(domain->d_data, 0, data_size));

    // Allocate send and receive buffers for each face
    for (int i = 0; i < 6; i++) {
        size_t buffer_size;
        if (i < 2) {        // X faces
            buffer_size = (size_t)GHOST_WIDTH * (domain->ny + 2*GHOST_WIDTH) *
                          (domain->nz + 2*GHOST_WIDTH) * sizeof(double);
        } else if (i < 4) { // Y faces
            buffer_size = (size_t)(domain->nx + 2*GHOST_WIDTH) * GHOST_WIDTH *
                          (domain->nz + 2*GHOST_WIDTH) * sizeof(double);
        } else {            // Z faces
            buffer_size = (size_t)(domain->nx + 2*GHOST_WIDTH) *
                          (domain->ny + 2*GHOST_WIDTH) * GHOST_WIDTH * sizeof(double);
        }
        CUDA_CHECK(cudaMalloc(&domain->d_send_buffers[i], buffer_size));
        CUDA_CHECK(cudaMalloc(&domain->d_recv_buffers[i], buffer_size));
    }
}

// Pack the interior planes adjacent to an X boundary into a send buffer
// face == 0: low-x (left) face, face == 1: high-x (right) face
__global__ void pack_x_face(double *data, double *buffer, int nx, int ny, int nz,
                            int ghost_width, int face) {
    int j = blockIdx.x * blockDim.x + threadIdx.x;
    int k = blockIdx.y * blockDim.y + threadIdx.y;

    if (j < ny + 2*ghost_width && k < nz + 2*ghost_width) {
        int plane = (ny + 2*ghost_width) * (nz + 2*ghost_width);
        for (int g = 0; g < ghost_width; g++) {
            // The left face sends the first interior planes, the right face the last interior planes
            int i = (face == 0) ? (ghost_width + g) : (nx + g);
            int src_idx = i * plane + j * (nz + 2*ghost_width) + k;
            int buf_idx = g * plane + j * (nz + 2*ghost_width) + k;
            buffer[buf_idx] = data[src_idx];
        }
    }
}

// Similar pack kernels for the Y and Z faces would be defined here...

// Unpack a receive buffer into the X ghost layers
// face == 0: low-x (left) ghost planes, face == 1: high-x (right) ghost planes
__global__ void unpack_x_face(double *data, double *buffer, int nx, int ny, int nz,
                              int ghost_width, int face) {
    int j = blockIdx.x * blockDim.x + threadIdx.x;
    int k = blockIdx.y * blockDim.y + threadIdx.y;

    if (j < ny + 2*ghost_width && k < nz + 2*ghost_width) {
        int plane = (ny + 2*ghost_width) * (nz + 2*ghost_width);
        for (int g = 0; g < ghost_width; g++) {
            int i = (face == 0) ? g : (nx + ghost_width + g);
            int dest_idx = i * plane + j * (nz + 2*ghost_width) + k;
            int buf_idx = g * plane + j * (nz + 2*ghost_width) + k;
            data[dest_idx] = buffer[buf_idx];
        }
    }
}

// Similar unpack kernels for the Y and Z faces would be defined here...

// Exchange ghost data with neighbors
void exchange_ghost_data(DomainInfo *domain) {
    MPI_Request requests[12]; // Up to 6 sends + 6 receives
    MPI_Status statuses[12];
    int request_count = 0;

    // Determine neighbor ranks
    int neighbors[6];
    MPI_Cart_shift(domain->cart_comm, 0, 1, &neighbors[0], &neighbors[1]); // X direction
    MPI_Cart_shift(domain->cart_comm, 1, 1, &neighbors[2], &neighbors[3]); // Y direction
    MPI_Cart_shift(domain->cart_comm, 2, 1, &neighbors[4], &neighbors[5]); // Z direction

    // Post receives first; the receive for a face is tagged with that face index
    for (int face = 0; face < 6; face++) {
        if (neighbors[face] != MPI_PROC_NULL) {
            int recv_count;
            if (face < 2) {        // X faces
                recv_count = GHOST_WIDTH * (domain->ny + 2*GHOST_WIDTH) * (domain->nz + 2*GHOST_WIDTH);
            } else if (face < 4) { // Y faces
                recv_count = (domain->nx + 2*GHOST_WIDTH) * GHOST_WIDTH * (domain->nz + 2*GHOST_WIDTH);
            } else {               // Z faces
                recv_count = (domain->nx + 2*GHOST_WIDTH) * (domain->ny + 2*GHOST_WIDTH) * GHOST_WIDTH;
            }
            MPI_Irecv(domain->d_recv_buffers[face], recv_count, MPI_DOUBLE, neighbors[face],
                      face, domain->cart_comm, &requests[request_count++]);
        }
    }

    // Pack and send data
    for (int face = 0; face < 6; face++) {
        if (neighbors[face] != MPI_PROC_NULL) {
            // Launch the appropriate pack kernel for this face
            dim3 block(16, 16);
            dim3 grid;
            if (face < 2) { // X faces
                grid.x = (domain->ny + 2*GHOST_WIDTH + block.x - 1) / block.x;
                grid.y = (domain->nz + 2*GHOST_WIDTH + block.y - 1) / block.y;
                pack_x_face<<<grid, block>>>(domain->d_data, domain->d_send_buffers[face],
                                             domain->nx, domain->ny, domain->nz,
                                             GHOST_WIDTH, face);
            } // Similar launches for the Y and Z faces would go here...
            CUDA_CHECK(cudaDeviceSynchronize());

            int send_count;
            if (face < 2) {        // X faces
                send_count = GHOST_WIDTH * (domain->ny + 2*GHOST_WIDTH) * (domain->nz + 2*GHOST_WIDTH);
            } else if (face < 4) { // Y faces
                send_count = (domain->nx + 2*GHOST_WIDTH) * GHOST_WIDTH * (domain->nz + 2*GHOST_WIDTH);
            } else {               // Z faces
                send_count = (domain->nx + 2*GHOST_WIDTH) * (domain->ny + 2*GHOST_WIDTH) * GHOST_WIDTH;
            }
            // Send with the opposite face's tag so it matches the tag of the neighbor's
            // MPI_Irecv (our low face fills the neighbor's high ghost layer, and vice versa)
            MPI_Isend(domain->d_send_buffers[face], send_count, MPI_DOUBLE, neighbors[face],
                      face ^ 1, domain->cart_comm, &requests[request_count++]);
        }
    }

    // Wait for all communications to complete
    MPI_Waitall(request_count, requests, statuses);

    // Unpack received data into the ghost layers
    for (int face = 0; face < 6; face++) {
        if (neighbors[face] != MPI_PROC_NULL) {
            // Launch the appropriate unpack kernel for this face
            dim3 block(16, 16);
            dim3 grid;
            if (face < 2) { // X faces
                grid.x = (domain->ny + 2*GHOST_WIDTH + block.x - 1) / block.x;
                grid.y = (domain->nz + 2*GHOST_WIDTH + block.y - 1) / block.y;
                unpack_x_face<<<grid, block>>>(domain->d_data, domain->d_recv_buffers[face],
                                               domain->nx, domain->ny, domain->nz,
                                               GHOST_WIDTH, face);
            } // Similar launches for the Y and Z faces would go here...
            CUDA_CHECK(cudaDeviceSynchronize());
        }
    }
}

// Clean up resources
void cleanup(DomainInfo *domain) {
    CUDA_CHECK(cudaFree(domain->d_data));
    for (int i = 0; i < 6; i++) {
        CUDA_CHECK(cudaFree(domain->d_send_buffers[i]));
        CUDA_CHECK(cudaFree(domain->d_recv_buffers[i]));
    }
    MPI_Finalize();
}

int main(int argc, char **argv) {
    DomainInfo domain;
    init_domain_decomposition(&domain, argc, argv);

    // Main simulation loop would go here
    for (int step = 0; step < 10; step++) {
        // Perform computation on the local domain

        // Exchange ghost data
        exchange_ghost_data(&domain);

        // Continue computation
    }

    cleanup(&domain);
    return 0;
}
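
One thing the listing above does not do is bind each MPI rank to a GPU. On nodes with several GPUs, every rank should select its own device before the first CUDA allocation, or all ranks will end up sharing device 0. A minimal sketch, called right after MPI_Init(); the helper name select_gpu_for_rank and the round-robin mapping are our own choices, not part of the listing above:

// Bind this rank to a GPU based on its node-local rank (round-robin).
// Call immediately after MPI_Init(), before any cudaMalloc.
void select_gpu_for_rank(void) {
    MPI_Comm node_comm;
    int local_rank, num_devices;
    MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0,
                        MPI_INFO_NULL, &node_comm);
    MPI_Comm_rank(node_comm, &local_rank);
    CUDA_CHECK(cudaGetDeviceCount(&num_devices));
    CUDA_CHECK(cudaSetDevice(local_rank % num_devices));
    MPI_Comm_free(&node_comm);
}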

Key Features of This Implementation:

  1. 3D Domain Decomposition:

    • Uses MPI Cartesian topology for logical process arrangement
    • Handles both divisible and non-divisible domain sizes
  2. GPU-Aware MPI:

    • Directly passes GPU pointers to MPI calls (requires CUDA-aware MPI)
    • Avoids host-staging for better performance
  3. Efficient Communication:

    • Non-blocking sends and receives for overlap opportunities
    • Separate buffers for each face to prevent contention
  4. Kernel-Based Packing/Unpacking:

    • CUDA kernels for efficient data movement between the main array and the face buffers (the indexing convention they assume is sketched after this list)
    • Parallel packing/unpacking operations
  5. Flexible Ghost Layer Width:

    • Configurable ghost layer size via GHOST_WIDTH macro
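
For reference, the pack/unpack kernels index the padded (nx+2g) x (ny+2g) x (nz+2g) local array in row-major order with the z index varying fastest. A small helper that makes this layout explicit (idx3d is a name introduced here for illustration; the listing above writes the expression out inline):

// Linear index into the padded (nx+2g) x (ny+2g) x (nz+2g) array,
// with k (the z index) as the fastest-varying dimension.
static __host__ __device__ inline size_t idx3d(int i, int j, int k,
                                               int ny, int nz, int g) {
    return (size_t)i * (ny + 2*g) * (nz + 2*g)
         + (size_t)j * (nz + 2*g)
         + (size_t)k;
}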

Requirements:

  1. MPI implementation with CUDA-aware support (Open MPI, MVAPICH2, etc.; a runtime check is sketched after this list)
  2. CUDA toolkit
  3. Compilation with nvcc and MPI compiler wrappers
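
Whether a given MPI build is actually CUDA-aware is worth verifying before passing device pointers to it. Open MPI exposes this through its MPI extensions; other libraries report it differently (MVAPICH2, for example, enables GPU support through the MV2_USE_CUDA environment variable). A minimal sketch, assuming Open MPI:

#include <stdio.h>
#include <mpi.h>
#include <mpi-ext.h>   // Open MPI extension header

// Report compile-time and run-time CUDA-aware support (Open MPI only).
void report_cuda_aware_support(void) {
#if defined(MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT
    printf("Compile-time check: this Open MPI build is CUDA-aware.\n");
#else
    printf("Compile-time check: CUDA-aware support not detected.\n");
#endif
#if defined(MPIX_CUDA_AWARE_SUPPORT)
    printf("Run-time check: %s\n",
           MPIX_Query_cuda_support() ? "CUDA-aware" : "not CUDA-aware");
#endif
}

For building, one common approach is to let nvcc drive compilation with the MPI wrapper as host compiler, e.g. nvcc -ccbin mpicxx -O3 ghost_exchange.cu -o ghost_exchange, and then launch with mpirun -np <ranks> ./ghost_exchange; the exact flags and wrapper names depend on your installation.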

Usage Notes:

  1. You’ll need to complete the pack/unpack kernels for the Y and Z faces (omitted for brevity; a Y-face pack kernel is sketched after this list)
  2. The code assumes periodic boundaries (adjust periods array if needed)
  3. For optimal performance, tune the block/grid dimensions in the kernels
  4. Add error checking for MPI operations in production code (a simple MPI_CHECK macro is sketched after this list)
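
As a starting point for note 1, a Y-face pack kernel could look like the sketch below. It mirrors pack_x_face but loops over j-planes; the matching unpack kernel and the Z-face pair follow the same pattern, and the buffer layout matches the Y-face allocation in init_domain_decomposition:

// Pack the interior planes adjacent to a Y boundary into a send buffer.
// face == 2: low-y face, face == 3: high-y face.
__global__ void pack_y_face(double *data, double *buffer, int nx, int ny, int nz,
                            int ghost_width, int face) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int k = blockIdx.y * blockDim.y + threadIdx.y;

    if (i < nx + 2*ghost_width && k < nz + 2*ghost_width) {
        int plane = (ny + 2*ghost_width) * (nz + 2*ghost_width);
        for (int g = 0; g < ghost_width; g++) {
            // Low-y face sends the first interior j-planes, high-y face the last ones
            int j = (face == 2) ? (ghost_width + g) : (ny + g);
            int src_idx = i * plane + j * (nz + 2*ghost_width) + k;
            int buf_idx = (i * ghost_width + g) * (nz + 2*ghost_width) + k;
            buffer[buf_idx] = data[src_idx];
        }
    }
}

For note 4, a macro in the same spirit as CUDA_CHECK can wrap MPI calls (note that MPI's default error handler aborts on failure, so you would also set MPI_Comm_set_errhandler(MPI_COMM_WORLD, MPI_ERRORS_RETURN) after MPI_Init for the check to be reached):

// Check the return code of an MPI call and abort with a readable message on failure.
#define MPI_CHECK(cmd) {                                                    \
    int mpi_err = cmd;                                                      \
    if (mpi_err != MPI_SUCCESS) {                                           \
        char err_str[MPI_MAX_ERROR_STRING];                                 \
        int err_len;                                                        \
        MPI_Error_string(mpi_err, err_str, &err_len);                       \
        printf("<%s>:%i [MPI] Error: %s\n", __FILE__, __LINE__, err_str);   \
        MPI_Abort(MPI_COMM_WORLD, mpi_err);                                 \
    }                                                                       \
}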

This implementation provides a solid foundation for multi-GPU simulations with 3D domain decomposition and ghost data exchange.
