/* File: mpi_mat_mat_fox_mult.c
*
* Purpose: Implement parallel matrix-matrix multiplication with
* Fox's algorithm. The matrices are stored in one-dimensional
* arrays and distributed over a q x q process grid in square
* blocks (checkerboard distribution).
*
* Compile: mpicc -g -Wall -o mpi_mat_mat_fox_mult mpi_mat_mat_fox_mult.c -lm
* Run: mpiexec -n <number of processes> ./mpi_mat_mat_fox_mult
*
* Input: Order n of the square matrices
* Output: Product matrix C = AB
*
* Errors: If an error is detected (n not positive, the number of
* processes not a perfect square, n not evenly divisible by
* sqrt(comm_sz), malloc fails), the program prints a message
* and all processes quit.
*
* Notes:
* 1. The number of processes must be a perfect square q*q, and q
* must evenly divide n
* 2. Define DEBUG for verbose output
*
*/
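/* Example layout (assuming p = 4 processes and n = 4, so q = 2 and each
 * process holds a 2 x 2 block):
 *
 *     A = [ a00 a01 | a02 a03 ]   process (0,0) gets rows 0-1, cols 0-1
 *         [ a10 a11 | a12 a13 ]   process (0,1) gets rows 0-1, cols 2-3
 *         [ ------- + ------- ]   process (1,0) gets rows 2-3, cols 0-1
 *         [ a20 a21 | a22 a23 ]   process (1,1) gets rows 2-3, cols 2-3
 *         [ a30 a31 | a32 a33 ]
 *
 * A typical run for this case would be
 *     mpiexec -n 4 ./mpi_mat_mat_fox_mult
 * entering 4 when prompted for the matrix order.
 */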
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
#include <math.h>
#include <time.h>
#define DEBUG 1 /* extra diagnostic output */
#define PRINT_INPUT 1 /* print A and B after they are read/generated */
#define READ_FROM_KEYBOARD 0 /* 1: read A and B from stdin, 0: fill them with random values */
#define USE_TYPE_INDEXED 0 /* 1: distribute blocks with an MPI indexed datatype */
#define MAX_SIZE 10000 /* maximum number of doubles in one local block */
typedef struct{
int n_bar; /* Order of the local block: the block is n_bar x n_bar */
int n; /* Number of elements in the block: n = n_bar*n_bar (must not exceed MAX_SIZE) */
double matrix[MAX_SIZE];
}LOCAL_MATRIX_T;
typedef struct{
int p; /* Total number of processes */
MPI_Comm comm; /* Communicator for entire grid */
MPI_Comm row_comm; /* Communicator for my row */
MPI_Comm col_comm; /* Communicator for my col */
int q; /* Order of grid */
int my_row; /* My row number */
int my_col; /* My col number */
int my_rank; /* My rank in the grid comm */
}GRID_INFO_T;
void Setup_grid(GRID_INFO_T* grid);
void Fox(int n, GRID_INFO_T* grid, LOCAL_MATRIX_T* local_A,
LOCAL_MATRIX_T* local_B, LOCAL_MATRIX_T* local_C);
LOCAL_MATRIX_T* Local_matrix_allocate(int n_bar);
void Set_to_zero(LOCAL_MATRIX_T *local_matrix);
void Local_matrix_mutiply(LOCAL_MATRIX_T* local_A,
LOCAL_MATRIX_T* local_B, LOCAL_MATRIX_T* local_C);
void Build_mpi_type(LOCAL_MATRIX_T* local_matrix, MPI_Datatype* input_mpi_type);
void Check_for_error(int local_ok, char fname[], char message[],
MPI_Comm comm);
void Get_dims(int* n_p, int* local_n_p, int my_rank, int comm_sz, MPI_Comm comm);
void Allocate_arrays(LOCAL_MATRIX_T** local_A_pp, LOCAL_MATRIX_T** local_B_pp, LOCAL_MATRIX_T** local_C_pp, int local_n, MPI_Comm comm);
void Read_matrix(char prompt[], LOCAL_MATRIX_T* local_matrix, GRID_INFO_T* grid, int n);
void Print_matrix(char prompt[], LOCAL_MATRIX_T* local_matrix, GRID_INFO_T* grid, int n);
void Print_local_matrix(char prompt[], LOCAL_MATRIX_T *local_matrix, GRID_INFO_T* grid);
void Print(char prompt[], double M[], int n);
MPI_Datatype local_matrix_mpi_t; /* Derived datatype describing a whole LOCAL_MATRIX_T, used in the collective communication */
#if USE_TYPE_INDEXED
MPI_Datatype local_matrix_mpi_t_i; /* Indexed derived datatype: one k x k block as it lies inside the full n x n matrix */
void Build_mpi_type_indexed(MPI_Datatype* input_mpi_type, int n, int local_n);
#endif
/*-------------------------------------------------------------------*/
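/* main: read the matrix order, allocate the local blocks, build the process
 * grid, scatter A and B from process 0, run Fox's algorithm, and gather and
 * print C at process 0. */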
int main(void){
int n, local_n; /* Order of the global matrix and of each local block */
int my_rank, comm_sz;
MPI_Comm comm;
LOCAL_MATRIX_T* local_A, *local_B, *local_C;
GRID_INFO_T grid; /* Cartesian process-grid topology */
MPI_Init(NULL, NULL);
comm = MPI_COMM_WORLD;
MPI_Comm_size(comm, &comm_sz);
MPI_Comm_rank(comm, &my_rank);
srand((unsigned int)time(NULL)); /* Seed the random number generator */
Get_dims(&n, &local_n, my_rank, comm_sz, comm); /* Read and validate the matrix order */
Allocate_arrays(&local_A, &local_B, &local_C, local_n, comm); /* Allocate the local blocks */
Set_to_zero(local_A);
Set_to_zero(local_B);
Set_to_zero(local_C);
Setup_grid(&grid);
Read_matrix("A", local_A, &grid, n);
Read_matrix("B", local_B, &grid, n);
#if USE_TYPE_INDEXED
Build_mpi_type_indexed(&local_matrix_mpi_t, n, local_n);
#endif
Build_mpi_type(local_A,&local_matrix_mpi_t);
Fox(n, &grid, local_A, local_B, local_C);
#if DEBUG
//printf("rank:%d n: %d local_n: %d\n",my_rank,n,local_n);
//printf("rank:%d &n_bar: %p &n: %p &matrix: %p\n",my_rank,&local_A->n_bar,&local_A->n,local_A->matrix);
//Print_local_matrix("local_A", local_A, &grid);
//Print_local_matrix("local_B", local_B, &grid);
//Print_local_matrix("local_C", local_C, &grid);
#endif
Print_matrix("C", local_C, &grid, n);
free(local_A);
free(local_B);
free(local_C);
MPI_Type_free(&local_matrix_mpi_t);
#if USE_TYPE_INDEXED
MPI_Type_free(&local_matrix_mpi_t_i);
#endif
MPI_Finalize();
return 0;
} /* main */
/*-------------------------------------------------------------------*/
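/* Check_for_error: collective error check.  Every process contributes its
 * local_ok flag through MPI_Allreduce with MPI_MIN; if any process reports a
 * problem, rank 0 prints the message and all processes shut down together,
 * which avoids hangs from mismatched collectives later on. */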
void Check_for_error(
int local_ok /* in */,
char fname[] /* in */,
char message[] /* in */,
MPI_Comm comm /* in */) {
int ok;
MPI_Allreduce(&local_ok, &ok, 1, MPI_INT, MPI_MIN, comm);
if (ok == 0) {
int my_rank;
MPI_Comm_rank(comm, &my_rank);
if (my_rank == 0) {
fprintf(stderr, "Proc %d > In %s, %s\n", my_rank, fname,
message);
fflush(stderr);
}
MPI_Finalize();
exit(-1);
}
} /* Check_for_error */
/*-------------------------------------------------------------------*/
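/* Get_dims: process 0 reads the matrix order n and broadcasts it.  Every
 * process then checks that comm_sz is a perfect square q*q and that q evenly
 * divides n, and computes the local block order local_n = n/q. */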
void Get_dims(
int* n_p /* out */,
int* local_n_p /* out */,
int my_rank /* in */,
int comm_sz /* in */,
MPI_Comm comm /* in */) {
int local_ok = 1;
int q;
if (my_rank == 0) {
printf("Enter the order n of the square matrices\n");
scanf("%d", n_p);
}
MPI_Bcast(n_p, 1, MPI_INT, 0, comm);
q = (int)sqrt((double)comm_sz);
if (*n_p <= 0 || q*q != comm_sz || *n_p % q != 0) local_ok = 0;
Check_for_error(local_ok, "Get_dims",
"n must be positive, comm_sz must be a perfect square, and sqrt(comm_sz) must evenly divide n",
comm);
*local_n_p = *n_p/q;
} /* Get_dims */
void Allocate_arrays(
LOCAL_MATRIX_T** local_A_pp /* out */,
LOCAL_MATRIX_T** local_B_pp /* out */,
LOCAL_MATRIX_T** local_C_pp /* out */,
int local_n /* in */,
MPI_Comm comm /* in */) {
int local_ok = 1;
*local_A_pp = Local_matrix_allocate(local_n);
*local_B_pp = Local_matrix_allocate(local_n);
*local_C_pp = Local_matrix_allocate(local_n);
if (*local_A_pp == NULL || *local_B_pp == NULL ||
*local_C_pp == NULL) local_ok = 0;
Check_for_error(local_ok, "Allocate_arrays",
"Can't allocate local arrays", comm);
} /* Allocate_arrays */
/*-------------------------------------------------------------------*/
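/* Read_matrix: process 0 builds (or reads) the full n x n matrix M and hands
 * each grid process its k x k block, where k = n/q.  In the default
 * (non-indexed) path the block travels one row segment at a time: global row
 * i, column block j is the k doubles starting at M[i*n + j*k], and they go
 * to the process at grid coordinates (i/k, j).  For example, with n = 4 and
 * q = 2, row 3 of the right column block is M[3*4+2 .. 3*4+3] and belongs to
 * the process at (1, 1). */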
void Read_matrix(
char prompt[] /* in */,
LOCAL_MATRIX_T* local_matrix /* out */,
GRID_INFO_T* grid /* in */,
int n /* in */) {
double* M = NULL;
int local_ok = 1;
int i, j, u, v;
int dest, k;
int coordinates[2];
k = local_matrix->n_bar;
if (grid->my_rank == 0) {
v = 0;
M = malloc(n*n*sizeof(double));
if (M == NULL) local_ok = 0;
Check_for_error(local_ok, "Read_matrix",
"Can't allocate temporary matrix", grid->comm);
#if READ_FROM_KEYBOARD
printf("Enter the matrix %s\n", prompt);
#endif
for (i = 0; i < n; i++)
for (j = 0; j < n; j++)
#if READ_FROM_KEYBOARD
scanf("%lf", &M[i*n+j]);
#else
M[i*n+j] = rand() % 50; //(0-49)
#endif
#if PRINT_INPUT
Print(prompt,M,n);
#endif
#if USE_TYPE_INDEXED
for(dest = 0; dest < grid->p; dest++){
MPI_Cart_coords(grid->comm, dest, 2, coordinates);
if(dest == grid->my_rank){
for(u = 0; u < k; u++)
local_matrix->matrix[v*k + u] = M[i*n + j*k + u];
v++;
}
else
MPI_Send(&M[coordinates[0]*k + coordinates[1]*k], 1, local_matrix_mpi_t_i, dest, 0, grid->comm);
}
free(M);
} else {
Check_for_error(local_ok, "Read_matrix",
"Can't allocate temporary matrix", grid->comm);
for(i = 0; i < k; i++)
MPI_Recv(&local_matrix->matrix, k*k, MPI_DOUBLE, 0, 0, grid->comm, MPI_STATUS_IGNORE);
}
#else
for(i = 0; i < n; i++){
for(j = 0; j < grid->q; j++){
coordinates[0] = i / k;
coordinates[1] = j;
MPI_Cart_rank(grid->comm, coordinates, &dest);
if(dest == grid->my_rank){
for(u = 0; u < k; u++)
local_matrix->matrix[v*k + u] = M[i*n + j*k + u];
v++;
}
else
MPI_Send(&M[i*n+j*k], k, MPI_DOUBLE, dest, 0, grid->comm);
}
}
free(M);
} else {
Check_for_error(local_ok, "Read_matrix",
"Can't allocate temporary matrix", grid->comm);
for(i = 0; i < k; i++)
MPI_Recv(&local_matrix->matrix[k*i], k, MPI_DOUBLE, 0, 0, grid->comm, MPI_STATUS_IGNORE);
}
#endif
} /* Read_matrix */
/*-------------------------------------------------------------------*/
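/* Print_matrix: the inverse of Read_matrix.  Process 0 gathers every k x k
 * block (its own by direct copy, the others via MPI_Recv, row segment by row
 * segment in the default path) back into a full n x n matrix and prints it. */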
void Print_matrix(
char prompt[] /* in */,
LOCAL_MATRIX_T* local_matrix /* out */,
GRID_INFO_T* grid /* in */,
int n /* in */) {
double* M = NULL;
int local_ok = 1;
int i, j, u, v;
int source, k;
int coordinates[2];
k = local_matrix->n_bar;
if (grid->my_rank == 0) {
v = 0;
M = malloc(n*n*sizeof(double));
if (M == NULL) local_ok = 0;
Check_for_error(local_ok, "Print_matrix",
"Can't allocate temporary matrix", grid->comm);
#if USE_TYPE_INDEXED
for(source = 0; source < grid->p; source++){
MPI_Cart_coords(grid->comm, source, 2, coordinates);
if(source == grid->my_rank){
for(u = 0; u < k; u++)
local_matrix->matrix[v*k + u] = M[i*n + j*k + u];
v++;
}
else
MPI_Recv(&M[coordinates[0]*k + coordinates[1]*k], 1, local_matrix_mpi_t_i, source, 0, grid->comm, MPI_STATUS_IGNORE);
}
Print(prompt, M, n);
free(M);
} else {
Check_for_error(local_ok, "Read_matrix",
"Can't allocate temporary matrix", grid->comm);
for(i = 0; i < k; i++)
MPI_Recv(&local_matrix->matrix, k*k, MPI_DOUBLE, 0, 0, grid->comm);
}
#else
for(i = 0; i < n; i++){
for(j = 0; j < grid->q; j++){
coordinates[0] = i / k;
coordinates[1] = j;
MPI_Cart_rank(grid->comm, coordinates, &source);
if(source == grid->my_rank){
for(u = 0; u < k; u++)
M[i*n + j*k + u] = local_matrix->matrix[v*k + u];
v++;
}else
MPI_Recv(&M[i*n+j*k], k, MPI_DOUBLE, source, 0, grid->comm, MPI_STATUS_IGNORE);
}
}
Print(prompt, M, n);
free(M);
} else {
Check_for_error(local_ok, "Print_matrix",
"Can't allocate temporary matrix", grid->comm);
for(i = 0; i < k; i++)
MPI_Send(&local_matrix->matrix[k*i], k, MPI_DOUBLE, 0, 0, grid->comm);
}
#endif
} /* Print_matrix */
/*-------------------------------------------------------------------*/
void Print_local_matrix(
char prompt[], /* in */
LOCAL_MATRIX_T* local_matrix, /* in */
GRID_INFO_T* grid){ /* in */
char temp_prompt[50];
sprintf(temp_prompt,"%s of rank %d", prompt,grid->my_rank);
Print(temp_prompt, local_matrix->matrix, local_matrix->n_bar);
}/* Print_local_matrix */
/*-------------------------------------------------------------------*/
void Print(
char prompt[], /* in */
double M[], /* in */
int n){ /* in */
int i, j;
printf("matrix %s\n",prompt);
for(i = 0; i < n; i++){
for(j = 0; j < n; j++)
printf(" %11lf", M[i*n+j]);
printf("\n");
}
printf("\n");
}
/*-------------------------------------------------------------------*/
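/* Setup_grid: builds a q x q periodic Cartesian communicator and, from it,
 * one communicator per grid row and one per grid column with MPI_Cart_sub.
 * Fox's algorithm broadcasts blocks of A inside row_comm and circularly
 * shifts blocks of B inside col_comm.  For example, with p = 4 the grid is
 * 2 x 2; the process at coordinates (1,0) shares row_comm with (1,1) and
 * col_comm with (0,0). */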
void Setup_grid(GRID_INFO_T* grid){
int old_rank;
int dimensions[2];
int wrap_around[2];
int coordinates[2];
int free_coords[2];
/* Set up Global Grid Information */
MPI_Comm_size(MPI_COMM_WORLD,&(grid->p));
MPI_Comm_rank(MPI_COMM_WORLD,&old_rank);
/* We assume p is a perfect square */
grid->q = (int)sqrt((double)grid->p);
dimensions[0] = dimensions[1] = grid->q;
wrap_around[0] = wrap_around[1] = 1;
MPI_Cart_create(MPI_COMM_WORLD, 2, dimensions, wrap_around, 1, &(grid->comm));
MPI_Comm_rank(grid->comm, &(grid->my_rank));
MPI_Cart_coords(grid->comm, grid->my_rank, 2, coordinates);
grid->my_row = coordinates[0];
grid->my_col = coordinates[1];
/* Set up row communicators */
free_coords[0] = 0;
free_coords[1] = 1;
MPI_Cart_sub(grid->comm, free_coords, &(grid->row_comm));
/* Set up col communicators */
free_coords[0] = 1;
free_coords[1] = 0;
MPI_Cart_sub(grid->comm, free_coords, &(grid->col_comm));
}/* Setup_grid */
/*-------------------------------------------------------------------*/
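/* Fox: computes C = A*B in q stages.  In stage s, the process in grid row r
 * whose column equals (r + s) mod q broadcasts its block of A along the row;
 * every process multiplies the received block by its current block of B and
 * adds the result to its block of C; then every block of B is shifted one
 * step up its column (with wraparound).  Sketch for q = 2, process (0,0):
 *   stage 0: gets A(0,0), computes C(0,0) += A(0,0)*B(0,0), then receives B(1,0)
 *   stage 1: gets A(0,1), computes C(0,0) += A(0,1)*B(1,0)
 * which is exactly C(0,0) = A(0,0)*B(0,0) + A(0,1)*B(1,0). */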
void Fox(
int n /* in */,
GRID_INFO_T* grid /* in */,
LOCAL_MATRIX_T* local_A /* in */,
LOCAL_MATRIX_T* local_B /* in */,
LOCAL_MATRIX_T* local_C /* out */){
LOCAL_MATRIX_T* temp_A; /* Storage for the submatrix of A*/
/* used during the current stage */
int stage;
int bcast_root;
int n_bar;
int source;
int dest;
MPI_Status status;
n_bar = n / grid->q; /* the dimension of submatrix */
Set_to_zero(local_C);
/* Calculate addresses for circular shift of B */
source = (grid->my_row + 1) % grid->q;
dest = (grid->my_row -1 + grid->q) % grid->q;
/* Set aside storage for the broadcast block of A */
temp_A = Local_matrix_allocate(n_bar);
for(stage = 0; stage < grid->q; stage++){
bcast_root = (grid->my_row + stage) % grid->q;
if(bcast_root == grid->my_col){
MPI_Bcast(local_A, 1, local_matrix_mpi_t, bcast_root, grid->row_comm);
Local_matrix_mutiply(local_A, local_B,local_C);
}else{
MPI_Bcast(temp_A, 1, local_matrix_mpi_t, bcast_root, grid->row_comm);
Local_matrix_mutiply(temp_A, local_B, local_C);
}
MPI_Sendrecv_replace(local_B, 1, local_matrix_mpi_t, dest, 0,
source, 0, grid->col_comm, &status);
}/* for */
free(temp_A);
}/* Fox */
/*-------------------------------------------------------------------*/
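/* Build_mpi_type_indexed: describes one k x k block as it lies inside the
 * full n x n row-major matrix held by process 0: local_n blocks of local_n
 * doubles, with consecutive blocks n doubles apart.  For example, with n = 4
 * and local_n = 2 the type selects the doubles at offsets {0, 1, 4, 5} from
 * the block's upper-left corner, i.e. a 2 x 2 block of a 4 x 4 matrix. */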
void Build_mpi_type_indexed(MPI_Datatype* input_mpi_type, int n, int local_n){
int i;
int count = local_n;
int array_of_blocklength[count];
int array_of_displacements[count];
for(i = 0; i < count; i++){
array_of_blocklength[i] = local_n; /* one row of the local block */
array_of_displacements[i] = n*i; /* consecutive rows are n doubles apart in the full matrix */
}
MPI_Type_indexed(count, array_of_blocklength, array_of_displacements,
MPI_DOUBLE, input_mpi_type);
MPI_Type_commit(input_mpi_type);
}/* Build_mpi_type_indexed */
/*-------------------------------------------------------------------*/
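/* Build_mpi_type: builds an MPI struct type that matches LOCAL_MATRIX_T so a
 * whole local block (n_bar, n, and the n_bar*n_bar doubles in use) can be
 * broadcast or shifted with a single call.  Block lengths are
 * {1, 1, n_bar*n_bar} and each displacement is the byte offset of the field
 * from the start of the struct, obtained with MPI_Get_address. */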
void Build_mpi_type(
LOCAL_MATRIX_T* local_matrix /* in */,
MPI_Datatype* input_mpi_type /* out*/){
int count = 3;
int array_of_blocklengths[3]= {1,1,local_matrix->n};
MPI_Datatype array_of_types[3] = {MPI_INT, MPI_INT, MPI_DOUBLE};
MPI_Aint n_bar_addr, n_addr, matrix_addr;
MPI_Aint array_of_displacements[3] = {0};
MPI_Get_address(&local_matrix->n_bar, &n_bar_addr);
MPI_Get_address(&local_matrix->n, &n_addr);
MPI_Get_address(&local_matrix->matrix, &matrix_addr);
array_of_displacements[1] = n_addr - n_bar_addr;
array_of_displacements[2] = matrix_addr - n_bar_addr; /* displacements are relative to the start of the struct */
MPI_Type_create_struct(count, array_of_blocklengths, array_of_displacements, array_of_types, input_mpi_type);
MPI_Type_commit(input_mpi_type); /* make the new datatype usable in communication */
}/* Build_mpi_type */
/*-------------------------------------------------------------------*/
LOCAL_MATRIX_T* Local_matrix_allocate(
int n_bar /* in */){
LOCAL_MATRIX_T *temp_matrix = (LOCAL_MATRIX_T*)malloc(sizeof(LOCAL_MATRIX_T));
if (temp_matrix == NULL) return NULL; /* Allocate_arrays reports the failure collectively */
temp_matrix->n_bar = n_bar;
temp_matrix->n = n_bar*n_bar;
return temp_matrix;
}/* Local_matrix_allocate */
/*-------------------------------------------------------------------*/
void Set_to_zero(
LOCAL_MATRIX_T *local_matrix /* in */){
int i;
for(i = 0; i < local_matrix->n; i++){
local_matrix->matrix[i] = 0;
}
}/* Set_to_zero */
/*-------------------------------------------------------------------*/
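/* Local_matrix_mutiply: standard triple loop computing
 * local_C += local_A * local_B on n_bar x n_bar blocks.  It accumulates
 * rather than overwrites because Fox calls it once per stage: local_C is
 * zeroed at the start of Fox and each stage adds one partial product. */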
void Local_matrix_mutiply(
LOCAL_MATRIX_T* local_A /* in */,
LOCAL_MATRIX_T* local_B /* in */,
LOCAL_MATRIX_T* local_C /* out */){
int i,j,k,n = local_C->n_bar;
for(i = 0; i < n; i++){
for(j = 0; j < n; j++){
for(k = 0; k < n; k++)
local_C->matrix[i*n+j] += local_A->matrix[i*n+k]*local_B->matrix[k*n+j];
}
}
}/* Local_matrix_mutiply */