这里给出Cannon矩阵乘法例子。计算任务描述如下:
  A,B为n×n维的稠密矩阵。计算A,B的积,并存放到C,即:
       C = A × B
  Cannon的矩阵乘法算法把p个处理器看成是一个二维的√p×√p网格,各处理器在网格的坐标记为p(i, j),其中i, j = 0, 1, ..., √p−1。并将A,B,C分别划分为√p×√p的块,对各块编号,将A(i,j),B(i,j),C(i,j)分给p(i,j)。当完成数据初始赋值后,Cannon的算法按√p步进行,在每一步中,每个处理器完成本地的A(i,j)和B(i,j)(它们都是(n/√p)×(n/√p)的矩阵)的乘法,然后将A发送给自己左边的处理器,B发送给自己上边的处理器(边界上的处理器回绕)。算法完成后,C的值按块分布在各个处理器上。
  //Cannon's MM algorithm.
  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>
  #include <math.h>
  #include <mpi.h>

  /*
   * Map torus grid coordinates (row, col) to a linear process rank.
   * Adding npdim before taking the remainder lets callers pass
   * coordinates one grid-width out of range in either direction
   * (e.g. -1 or npdim), implementing the torus wrap-around.
   */
  int CartToRank(int row, int col, int npdim)
  {
   int wrapped_row = (row + npdim) % npdim;
   int wrapped_col = (col + npdim) % npdim;
   return wrapped_row * npdim + wrapped_col;
  }

  /*
   * Multiply two nlocal x nlocal blocks stored contiguously in
   * row-major order and accumulate into c:  c += a * b.
   *
   * BUG FIX: the original wrote c[i,j] += a[i,k] * b[k,j]. In C the
   * comma operator discards its left operand, so that actually read
   * c[j], a[k], b[j] — the product was completely wrong. The indices
   * must be linearized explicitly: element (r, col) lives at
   * r*nlocal + col.
   */
  void MatrixMultiply(int nlocal, double *a, double *b, double *c)
  {
   int i, j, k;
   for (i = 0; i < nlocal; i++)
    for (j = 0; j < nlocal; j++)
     for (k = 0; k < nlocal; k++)
      c[i*nlocal + j] += a[i*nlocal + k] * b[k*nlocal + j];
  }

  /*
   * Driver for Cannon's algorithm. The p processes form a
   * sqrt(p) x sqrt(p) torus; each rank owns one nlocal x nlocal block
   * of A, B and C. After the initial skew (A(i,j) shifted left by i,
   * B(i,j) shifted up by j), sqrt(p) multiply-and-shift steps leave
   * each rank holding its block of C = A * B.
   *
   * Fixes vs. the original: `void main` -> `int main` with explicit
   * returns (void main is not valid C); sqrt() result rounded instead
   * of truncated; n is checked for divisibility by the grid dimension;
   * malloc results are checked; malloc casts removed.
   */
  int main(int argc, char **argv)
  {
   int i;
   int n, nlocal;
   double *a, *b, *c, *wspace;
   int npes, npdim;
   int myrank, myrow, mycol;
   MPI_Status status;

   /* Initialize MPI and query the process count and this rank. */
   MPI_Init(&argc, &argv);
   MPI_Comm_size(MPI_COMM_WORLD, &npes);
   MPI_Comm_rank(MPI_COMM_WORLD, &myrank);

   if (argc != 2) {
    /* The global matrix dimension is a required argument. */
    if (myrank == 0)
     printf("Usage: %s <the dimension of the matrix>\n", argv[0]);
    MPI_Finalize();
    return 0;
   }
   /* Global matrix dimension. */
   n = atoi(argv[1]);

   /* Number of processes along each dimension of the grid; npes must
    * be a perfect square. Round the sqrt() result to the nearest
    * integer to guard against floating-point truncation. */
   npdim = (int)(sqrt((double)npes) + 0.5);
   if (npdim*npdim != npes) {
    if (myrank == 0)
     printf("The number of processes must be a perfect square.\n");
    MPI_Finalize();
    return 0;
   }

   /* The matrix must split evenly into npdim x npdim blocks. */
   if (n <= 0 || n % npdim != 0) {
    if (myrank == 0)
     printf("The matrix dimension must be a positive multiple of %d.\n",
            npdim);
    MPI_Finalize();
    return 0;
   }

   /* Dimension of each local block. */
   nlocal = n/npdim;

   /* Grid coordinates (myrow, mycol) of this rank. */
   myrow = myrank/npdim;
   mycol = myrank%npdim;

   /* Allocate the local blocks plus a scratch buffer for the shifts. */
   a = malloc(nlocal*nlocal*sizeof *a);
   b = malloc(nlocal*nlocal*sizeof *b);
   c = malloc(nlocal*nlocal*sizeof *c);
   wspace = malloc(nlocal*nlocal*sizeof *wspace);
   if (a == NULL || b == NULL || c == NULL || wspace == NULL) {
    printf("Rank %d: out of memory.\n", myrank);
    MPI_Abort(MPI_COMM_WORLD, 1);
   }

   /* Fill a and b with reproducible per-rank random values (the
    * actual contents are irrelevant); clear the accumulator c. */
   srand48((long)myrank);
   for (i = 0; i < nlocal*nlocal; i++) {
    a[i] = b[i] = drand48();
    c[i] = 0.0;
   }

   /* Initial alignment, first A then B: rank (i,j) sends its A block
    * to p(i, j-i), i.e. shifted left by i columns, and its B block to
    * p(i-j, j), i.e. shifted up by j rows; it receives the matching
    * block from the opposite direction. */
   MPI_Sendrecv(a, nlocal*nlocal, MPI_DOUBLE,
        CartToRank(myrow, mycol-myrow, npdim),
        1, wspace, nlocal*nlocal, MPI_DOUBLE,
        CartToRank(myrow, mycol+myrow, npdim),
        1, MPI_COMM_WORLD, &status);
   memcpy(a, wspace, nlocal*nlocal*sizeof(double));

   MPI_Sendrecv(b, nlocal*nlocal, MPI_DOUBLE,
        CartToRank(myrow-mycol, mycol, npdim),
        1, wspace, nlocal*nlocal, MPI_DOUBLE,
        CartToRank(myrow+mycol, mycol, npdim),
        1, MPI_COMM_WORLD, &status);
   memcpy(b, wspace, nlocal*nlocal*sizeof(double));

   /* Main loop: multiply the local blocks, then shift A one position
    * left and B one position up (with torus wrap-around). */
   for (i = 0; i < npdim; i++) {
    MatrixMultiply(nlocal, a, b, c); /* c = c + a*b */

    /* Shift A one position to the left. */
    MPI_Sendrecv(a, nlocal*nlocal, MPI_DOUBLE,
        CartToRank(myrow, mycol-1, npdim),
        1, wspace, nlocal*nlocal, MPI_DOUBLE,
        CartToRank(myrow, mycol+1, npdim),
        1, MPI_COMM_WORLD, &status);
    memcpy(a, wspace, nlocal*nlocal*sizeof(double));

    /* Shift B one position up. */
    MPI_Sendrecv(b, nlocal*nlocal, MPI_DOUBLE,
        CartToRank(myrow-1, mycol, npdim),
        1, wspace, nlocal*nlocal, MPI_DOUBLE,
        CartToRank(myrow+1, mycol, npdim),
        1, MPI_COMM_WORLD, &status);
    memcpy(b, wspace, nlocal*nlocal*sizeof(double));
   }

   /* Undo the initial alignment so a and b hold their original blocks
    * again (not required, but useful for verifying the algorithm). */
   MPI_Sendrecv(a, nlocal*nlocal, MPI_DOUBLE,
        CartToRank(myrow, mycol+myrow, npdim),
        1, wspace, nlocal*nlocal, MPI_DOUBLE,
        CartToRank(myrow, mycol-myrow, npdim),
        1, MPI_COMM_WORLD, &status);
   memcpy(a, wspace, nlocal*nlocal*sizeof(double));

   MPI_Sendrecv(b, nlocal*nlocal, MPI_DOUBLE,
        CartToRank(myrow+mycol, mycol, npdim),
        1, wspace, nlocal*nlocal, MPI_DOUBLE,
        CartToRank(myrow-mycol, mycol, npdim),
        1, MPI_COMM_WORLD, &status);
   memcpy(b, wspace, nlocal*nlocal*sizeof(double));

   /* Done: C is distributed block-wise across the ranks. */
   if (myrank == 0)
    printf("Done multiplying the matrices!\n");

   free(a);
   free(b);
   free(c);
   free(wspace);

   MPI_Finalize();
   return 0;
  }