MPI Code for Ghost Data Exchange in 3D Domain Decomposition with Multi-GPUs

Here’s a comprehensive MPI/CUDA example that demonstrates ghost (halo) data exchange for a 3D domain decomposition across multiple GPUs. The implementation assumes a CUDA-aware MPI library, so device pointers can be passed directly to MPI calls for efficient GPU-to-GPU communication.

#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
#include <cuda_runtime.h>

// Dimensions of the global domain
#define NX_GLOBAL 256
#define NY_GLOBAL 256
#define NZ_GLOBAL 256

// Ghost layer width
#define GHOST_WIDTH 2

// Macro for checking CUDA errors
#define CUDA_CHECK(cmd) {                                                   \
    cudaError_t error = cmd;                                                \
    if (error != cudaSuccess) {                                             \
        printf("<%s>:%i ", __FILE__, __LINE__);                             \
        printf("[CUDA] Error: %s\n", cudaGetErrorString(error));            \
        exit(EXIT_FAILURE);                                                 \
    }                                                                       \
}

// Structure to hold domain decomposition information
typedef struct {
    int rank;                       // MPI rank
    int size;                       // Number of MPI processes
    int dims[3];                    // Process grid dimensions
    int coords[3];                  // Process coordinates in grid
    MPI_Comm cart_comm;             // Cartesian communicator

    int nx, ny, nz;                 // Local domain sizes (excluding ghost layers)
    int start_x, start_y, start_z;  // Starting indices in the global domain

    double *d_data;                 // Main data array on device (with ghost layers)
    double *d_send_buffers[6];      // Send buffers, one per face
    double *d_recv_buffers[6];      // Receive buffers, one per face
} DomainInfo;

// Initialize domain decomposition
void init_domain_decomposition(DomainInfo *domain, int argc, char **argv) {
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &domain->rank);
    MPI_Comm_size(MPI_COMM_WORLD, &domain->size);

    // Create a 3D Cartesian topology
    int periods[3] = {1, 1, 1}; // Periodic boundaries
    domain->dims[0] = domain->dims[1] = domain->dims[2] = 0;
    MPI_Dims_create(domain->size, 3, domain->dims);
    MPI_Cart_create(MPI_COMM_WORLD, 3, domain->dims, periods, 1, &domain->cart_comm);
    MPI_Cart_coords(domain->cart_comm, domain->rank, 3, domain->coords);

    // Local domain sizes (excluding ghost layers)
    domain->nx = NX_GLOBAL / domain->dims[0];
    domain->ny = NY_GLOBAL / domain->dims[1];
    domain->nz = NZ_GLOBAL / domain->dims[2];

    // Starting indices in the global domain
    domain->start_x = domain->coords[0] * domain->nx;
    domain->start_y = domain->coords[1] * domain->ny;
    domain->start_z = domain->coords[2] * domain->nz;

    // Adjust for non-divisible sizes: the last process in each direction takes the remainder
    if (domain->coords[0] == domain->dims[0] - 1) domain->nx = NX_GLOBAL - domain->start_x;
    if (domain->coords[1] == domain->dims[1] - 1) domain->ny = NY_GLOBAL - domain->start_y;
    if (domain->coords[2] == domain->dims[2] - 1) domain->nz = NZ_GLOBAL - domain->start_z;

    // Allocate device memory for the main data array (including ghost layers)
    size_t data_size = (size_t)(domain->nx + 2*GHOST_WIDTH) *
                       (domain->ny + 2*GHOST_WIDTH) *
                       (domain->nz + 2*GHOST_WIDTH) * sizeof(double);
    CUDA_CHECK(cudaMalloc(&domain->d_data, data_size));
    CUDA_CHECK(cudaMemset(domain->d_data, 0, data_size));

    // Allocate send and receive buffers for each face
    for (int i = 0; i < 6; i++) {
        size_t buffer_size;
        if (i < 2) {        // X faces
            buffer_size = (size_t)GHOST_WIDTH * (domain->ny + 2*GHOST_WIDTH) *
                          (domain->nz + 2*GHOST_WIDTH) * sizeof(double);
        } else if (i < 4) { // Y faces
            buffer_size = (size_t)(domain->nx + 2*GHOST_WIDTH) * GHOST_WIDTH *
                          (domain->nz + 2*GHOST_WIDTH) * sizeof(double);
        } else {            // Z faces
            buffer_size = (size_t)(domain->nx + 2*GHOST_WIDTH) *
                          (domain->ny + 2*GHOST_WIDTH) * GHOST_WIDTH * sizeof(double);
        }
        CUDA_CHECK(cudaMalloc(&domain->d_send_buffers[i], buffer_size));
        CUDA_CHECK(cudaMalloc(&domain->d_recv_buffers[i], buffer_size));
    }
}

// Pack the interior planes adjacent to an X boundary into a send buffer
// face == 0: low-x (left) face, face == 1: high-x (right) face
__global__ void pack_x_face(double *data, double *buffer, int nx, int ny, int nz,
                            int ghost_width, int face) {
    int j = blockIdx.x * blockDim.x + threadIdx.x;
    int k = blockIdx.y * blockDim.y + threadIdx.y;

    if (j < ny + 2*ghost_width && k < nz + 2*ghost_width) {
        int plane = (ny + 2*ghost_width) * (nz + 2*ghost_width);
        for (int g = 0; g < ghost_width; g++) {
            // The left face sends the first interior planes, the right face the last interior planes
            int i = (face == 0) ? (ghost_width + g) : (nx + g);
            int src_idx = i * plane + j * (nz + 2*ghost_width) + k;
            int buf_idx = g * plane + j * (nz + 2*ghost_width) + k;
            buffer[buf_idx] = data[src_idx];
        }
    }
}

// Similar pack kernels for the Y and Z faces would be defined here...

// Unpack a receive buffer into the X ghost layers
// face == 0: low-x (left) ghost planes, face == 1: high-x (right) ghost planes
__global__ void unpack_x_face(double *data, double *buffer, int nx, int ny, int nz,
                              int ghost_width, int face) {
    int j = blockIdx.x * blockDim.x + threadIdx.x;
    int k = blockIdx.y * blockDim.y + threadIdx.y;

    if (j < ny + 2*ghost_width && k < nz + 2*ghost_width) {
        int plane = (ny + 2*ghost_width) * (nz + 2*ghost_width);
        for (int g = 0; g < ghost_width; g++) {
            int i = (face == 0) ? g : (nx + ghost_width + g);
            int dest_idx = i * plane + j * (nz + 2*ghost_width) + k;
            int buf_idx = g * plane + j * (nz + 2*ghost_width) + k;
            data[dest_idx] = buffer[buf_idx];
        }
    }
}

// Similar unpack kernels for the Y and Z faces would be defined here...

// Exchange ghost data with neighbors
void exchange_ghost_data(DomainInfo *domain) {
    MPI_Request requests[12]; // Up to 6 sends + 6 receives
    MPI_Status statuses[12];
    int request_count = 0;

    // Determine neighbor ranks
    int neighbors[6];
    MPI_Cart_shift(domain->cart_comm, 0, 1, &neighbors[0], &neighbors[1]); // X direction
    MPI_Cart_shift(domain->cart_comm, 1, 1, &neighbors[2], &neighbors[3]); // Y direction
    MPI_Cart_shift(domain->cart_comm, 2, 1, &neighbors[4], &neighbors[5]); // Z direction

    // Post receives first; the receive for a face is tagged with that face index
    for (int face = 0; face < 6; face++) {
        if (neighbors[face] != MPI_PROC_NULL) {
            int recv_count;
            if (face < 2) {        // X faces
                recv_count = GHOST_WIDTH * (domain->ny + 2*GHOST_WIDTH) * (domain->nz + 2*GHOST_WIDTH);
            } else if (face < 4) { // Y faces
                recv_count = (domain->nx + 2*GHOST_WIDTH) * GHOST_WIDTH * (domain->nz + 2*GHOST_WIDTH);
            } else {               // Z faces
                recv_count = (domain->nx + 2*GHOST_WIDTH) * (domain->ny + 2*GHOST_WIDTH) * GHOST_WIDTH;
            }
            MPI_Irecv(domain->d_recv_buffers[face], recv_count, MPI_DOUBLE, neighbors[face],
                      face, domain->cart_comm, &requests[request_count++]);
        }
    }

    // Pack and send data
    for (int face = 0; face < 6; face++) {
        if (neighbors[face] != MPI_PROC_NULL) {
            // Launch the appropriate pack kernel for this face
            dim3 block(16, 16);
            dim3 grid;
            if (face < 2) { // X faces
                grid.x = (domain->ny + 2*GHOST_WIDTH + block.x - 1) / block.x;
                grid.y = (domain->nz + 2*GHOST_WIDTH + block.y - 1) / block.y;
                pack_x_face<<<grid, block>>>(domain->d_data, domain->d_send_buffers[face],
                                             domain->nx, domain->ny, domain->nz,
                                             GHOST_WIDTH, face);
            } // Similar launches for the Y and Z faces would go here...
            CUDA_CHECK(cudaDeviceSynchronize());

            int send_count;
            if (face < 2) {        // X faces
                send_count = GHOST_WIDTH * (domain->ny + 2*GHOST_WIDTH) * (domain->nz + 2*GHOST_WIDTH);
            } else if (face < 4) { // Y faces
                send_count = (domain->nx + 2*GHOST_WIDTH) * GHOST_WIDTH * (domain->nz + 2*GHOST_WIDTH);
            } else {               // Z faces
                send_count = (domain->nx + 2*GHOST_WIDTH) * (domain->ny + 2*GHOST_WIDTH) * GHOST_WIDTH;
            }
            // Send with the opposite face's tag so it matches the tag of the neighbor's
            // MPI_Irecv (our low face fills the neighbor's high ghost layer, and vice versa)
            MPI_Isend(domain->d_send_buffers[face], send_count, MPI_DOUBLE, neighbors[face],
                      face ^ 1, domain->cart_comm, &requests[request_count++]);
        }
    }

    // Wait for all communications to complete
    MPI_Waitall(request_count, requests, statuses);

    // Unpack received data into the ghost layers
    for (int face = 0; face < 6; face++) {
        if (neighbors[face] != MPI_PROC_NULL) {
            // Launch the appropriate unpack kernel for this face
            dim3 block(16, 16);
            dim3 grid;
            if (face < 2) { // X faces
                grid.x = (domain->ny + 2*GHOST_WIDTH + block.x - 1) / block.x;
                grid.y = (domain->nz + 2*GHOST_WIDTH + block.y - 1) / block.y;
                unpack_x_face<<<grid, block>>>(domain->d_data, domain->d_recv_buffers[face],
                                               domain->nx, domain->ny, domain->nz,
                                               GHOST_WIDTH, face);
            } // Similar launches for the Y and Z faces would go here...
            CUDA_CHECK(cudaDeviceSynchronize());
        }
    }
}

// Clean up resources
void cleanup(DomainInfo *domain) {
    CUDA_CHECK(cudaFree(domain->d_data));
    for (int i = 0; i < 6; i++) {
        CUDA_CHECK(cudaFree(domain->d_send_buffers[i]));
        CUDA_CHECK(cudaFree(domain->d_recv_buffers[i]));
    }
    MPI_Finalize();
}

int main(int argc, char **argv) {
    DomainInfo domain;
    init_domain_decomposition(&domain, argc, argv);

    // Main simulation loop would go here
    for (int step = 0; step < 10; step++) {
        // Perform computation on the local domain

        // Exchange ghost data
        exchange_ghost_data(&domain);

        // Continue computation
    }

    cleanup(&domain);
    return 0;
}
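
One thing the listing above does not do is bind each MPI rank to a GPU. On nodes with several GPUs, every rank should select its own device before the first CUDA allocation, or all ranks will end up sharing device 0. A minimal sketch, called right after MPI_Init(); the helper name select_gpu_for_rank and the round-robin mapping are our own choices, not part of the listing above:

// Bind this rank to a GPU based on its node-local rank (round-robin).
// Call immediately after MPI_Init(), before any cudaMalloc.
void select_gpu_for_rank(void) {
    MPI_Comm node_comm;
    int local_rank, num_devices;
    MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0,
                        MPI_INFO_NULL, &node_comm);
    MPI_Comm_rank(node_comm, &local_rank);
    CUDA_CHECK(cudaGetDeviceCount(&num_devices));
    CUDA_CHECK(cudaSetDevice(local_rank % num_devices));
    MPI_Comm_free(&node_comm);
}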

Key Features of This Implementation:

  1. 3D Domain Decomposition:

    • Uses MPI Cartesian topology for logical process arrangement
    • Handles both divisible and non-divisible domain sizes
  2. GPU-Aware MPI:

    • Directly passes GPU pointers to MPI calls (requires CUDA-aware MPI)
    • Avoids host-staging for better performance
  3. Efficient Communication:

    • Non-blocking sends and receives for overlap opportunities
    • Separate buffers for each face to prevent contention
  4. Kernel-Based Packing/Unpacking:

    • CUDA kernels for efficient data movement between the main array and the face buffers (the indexing convention they assume is sketched after this list)
    • Parallel packing/unpacking operations
  5. Flexible Ghost Layer Width:

    • Configurable ghost layer size via GHOST_WIDTH macro
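
For reference, the pack/unpack kernels index the padded (nx+2g) x (ny+2g) x (nz+2g) local array in row-major order with the z index varying fastest. A small helper that makes this layout explicit (idx3d is a name introduced here for illustration; the listing above writes the expression out inline):

// Linear index into the padded (nx+2g) x (ny+2g) x (nz+2g) array,
// with k (the z index) as the fastest-varying dimension.
static __host__ __device__ inline size_t idx3d(int i, int j, int k,
                                               int ny, int nz, int g) {
    return (size_t)i * (ny + 2*g) * (nz + 2*g)
         + (size_t)j * (nz + 2*g)
         + (size_t)k;
}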

Requirements:

  1. MPI implementation with CUDA-aware support (Open MPI, MVAPICH2, etc.; a runtime check is sketched after this list)
  2. CUDA toolkit
  3. Compilation with nvcc and MPI compiler wrappers
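
Whether a given MPI build is actually CUDA-aware is worth verifying before passing device pointers to it. Open MPI exposes this through its MPI extensions; other libraries report it differently (MVAPICH2, for example, enables GPU support through the MV2_USE_CUDA environment variable). A minimal sketch, assuming Open MPI:

#include <stdio.h>
#include <mpi.h>
#include <mpi-ext.h>   // Open MPI extension header

// Report compile-time and run-time CUDA-aware support (Open MPI only).
void report_cuda_aware_support(void) {
#if defined(MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT
    printf("Compile-time check: this Open MPI build is CUDA-aware.\n");
#else
    printf("Compile-time check: CUDA-aware support not detected.\n");
#endif
#if defined(MPIX_CUDA_AWARE_SUPPORT)
    printf("Run-time check: %s\n",
           MPIX_Query_cuda_support() ? "CUDA-aware" : "not CUDA-aware");
#endif
}

For building, one common approach is to let nvcc drive compilation with the MPI wrapper as host compiler, e.g. nvcc -ccbin mpicxx -O3 ghost_exchange.cu -o ghost_exchange, and then launch with mpirun -np <ranks> ./ghost_exchange; the exact flags and wrapper names depend on your installation.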

Usage Notes:

  1. You’ll need to complete the pack/unpack kernels for the Y and Z faces (omitted for brevity; a Y-face pack kernel is sketched after this list)
  2. The code assumes periodic boundaries (adjust periods array if needed)
  3. For optimal performance, tune the block/grid dimensions in the kernels
  4. Add error checking for MPI operations in production code (a simple MPI_CHECK macro is sketched after this list)
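
As a starting point for note 1, a Y-face pack kernel could look like the sketch below. It mirrors pack_x_face but loops over j-planes; the matching unpack kernel and the Z-face pair follow the same pattern, and the buffer layout matches the Y-face allocation in init_domain_decomposition:

// Pack the interior planes adjacent to a Y boundary into a send buffer.
// face == 2: low-y face, face == 3: high-y face.
__global__ void pack_y_face(double *data, double *buffer, int nx, int ny, int nz,
                            int ghost_width, int face) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int k = blockIdx.y * blockDim.y + threadIdx.y;

    if (i < nx + 2*ghost_width && k < nz + 2*ghost_width) {
        int plane = (ny + 2*ghost_width) * (nz + 2*ghost_width);
        for (int g = 0; g < ghost_width; g++) {
            // Low-y face sends the first interior j-planes, high-y face the last ones
            int j = (face == 2) ? (ghost_width + g) : (ny + g);
            int src_idx = i * plane + j * (nz + 2*ghost_width) + k;
            int buf_idx = (i * ghost_width + g) * (nz + 2*ghost_width) + k;
            buffer[buf_idx] = data[src_idx];
        }
    }
}

For note 4, a macro in the same spirit as CUDA_CHECK can wrap MPI calls (note that MPI's default error handler aborts on failure, so you would also set MPI_Comm_set_errhandler(MPI_COMM_WORLD, MPI_ERRORS_RETURN) after MPI_Init for the check to be reached):

// Check the return code of an MPI call and abort with a readable message on failure.
#define MPI_CHECK(cmd) {                                                    \
    int mpi_err = cmd;                                                      \
    if (mpi_err != MPI_SUCCESS) {                                           \
        char err_str[MPI_MAX_ERROR_STRING];                                 \
        int err_len;                                                        \
        MPI_Error_string(mpi_err, err_str, &err_len);                       \
        printf("<%s>:%i [MPI] Error: %s\n", __FILE__, __LINE__, err_str);   \
        MPI_Abort(MPI_COMM_WORLD, mpi_err);                                 \
    }                                                                       \
}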

This implementation provides a solid foundation for multi-GPU simulations with 3D domain decomposition and ghost data exchange.
