5 years ago · f4efa8ee66
--- a/report.md
+++ b/report.md
@@ -281,12 +281,12 @@ void MatMultAdd(const MatWrap& a, const MatWrap& b, MatWrap& c) {
 
				 | 512      | 16     | 0.784           |

			
 
				 | 1024     | 64     | 4.856           |

			
 
				 

			
 
				-| 问题规模 | 并行数 | 总运行时间(sec) |

			
 
				-| -------- | ------ | --------------- |

			
 
				-| 512      | 1      | 2.551           |

			
 
				-| 512      | 4      | 1.099           |

			
 
				-| 512      | 16     | 0.784           |

			
 
				-| 512      | 64     | 1.248           |

			
 
				+| 问题规模 | 并行数 | 总运行时间(sec) | 加速比 |

			
 
				+| -------- | ------ | --------------- | ------ |

			
 
				+| 512      | 1      | 2.551           | 1.00   |

			
 
				+| 512      | 4      | 1.099           | 2.32   |

			
 
				+| 512      | 16     | 0.784           | 3.25   |

			
 
				+| 512      | 64     | 1.248           | 2.04   |

			
 
				 

			
 
				 ### 参数服务器系统

			
 
				 

			
@@ -370,12 +370,12 @@ end do
 
				 | 1024     | 4        | 24         | 0.268           |

			
 
				 | 1024     | 4        | 32         | 0.292           |

			
 
				 

			
 
				-| 交换次数 | 服务器数 | 工作进程数 | 总运行时间(sec) |

			
 
				-| -------- | -------- | ---------- | --------------- |

			
 
				-| 1024     | 1        | 64         | 0.441           |

			
 
				-| 1024     | 2        | 64         | 1.496           |

			
 
				-| 1024     | 4        | 64         | 0.714           |

			
 
				-| 1024     | 8        | 64         | 0.684           |

			
 
				+| 交换次数 | 服务器数 | 工作进程数 | 总运行时间(sec) | 加速比 |

			
 
				+| -------- | -------- | ---------- | --------------- | ------ |

			
 
				+| 1024     | 1        | 64         | 0.441           | -      |

			
 
				+| 1024     | 2        | 64         | 1.496           | -      |

			
 
				+| 1024     | 4        | 64         | 0.714           | -      |

			
 
				+| 1024     | 8        | 64         | 0.684           | -      |

			
 
				 

			
 
				 从结果上看，当工作进程数增加时，即使服务器数成比例增加，也难以保证总运行时间不增加，因为每次运行需要所有服务器间通信一次。

			
 
				 

			
@@ -583,12 +583,12 @@ void sigmoid(float *x, float *y, int n) {
 
				 | 256      | 4            | 6.681           |

			
 
				 | 512      | 8            | 46.498          |

			
 
				 

			
 
				-| 问题规模 | OpenMP线程数 | 总运行时间(sec) |

			
 
				-| -------- | ------------ | --------------- |

			
 
				-| 256      | 1            | 19.848          |

			
 
				-| 256      | 2            | 11.068          |

			
 
				-| 256      | 4            | 7.162           |

			
 
				-| 256      | 8            | 6.252           |

			
 
				+| 问题规模 | OpenMP线程数 | 总运行时间(sec) | 加速比 |

			
 
				+| -------- | ------------ | --------------- | ------ |

			
 
				+| 256      | 1            | 19.848          | 1.00   |

			
 
				+| 256      | 2            | 11.068          | 1.79   |

			
 
				+| 256      | 4            | 7.162           | 2.77   |

			
 
				+| 256      | 8            | 6.252           | 3.17   |

			
 
				 

			
 
				 ## 分组实验

			
 
				 

			
@@ -601,14 +601,21 @@ void sigmoid(float *x, float *y, int n) {
 
				 ### closure-MPI

			
 
				 

			
 
				 ##### 性能结果

			
 
				-

			
 
				+1

			
 
				 | 问题规模 | MPI进程数 | 总运行时间(sec) |

			
 
				 | -------- | --------- | --------------- |

			
 
				 | 64       | 1         | 0.122           |

			
 
				 | 128      | 4         | 0.267           |

			
 
				 | 256      | 16      | 0.276           |

			
 
				 | 512     | 64        | 4.923 |

			
 
				+2

			
 
				 

			
 
				+| 问题规模 | MPI进程数 | 总运行时间(sec) | 加速比 |

			
 
				+| -------- | --------- | --------------- | --------------- |

			
 
				+| 256     | 1         | 1.014      | 1.00  |

			
 
				+| 256    | 4         | 0.811      | 1.25 |

			
 
				+| 256      | 16      | 1.396      | 0.73  |

			
 
				+| 256   | 64        | 4.923 | 0.21 |

			
 
				 ##### 改进

			
 
				 

			
 
				 * 优化了数据传输，使用`MPI_Bcast, MPI_Scatter, MPI_Gather`替换了`for-MPI_Send-MPI_Recv`

			
@@ -620,16 +627,87 @@ void sigmoid(float *x, float *y, int n) {
 
				 

			
 
				 ### closure-Hybrid-omp-mpi

			
 
				 

			
 
				+##### MPI+OpenMP实现

			
 
				+

			
 
				+原有程序采用MPI行划分实现，很难再在行内新增OpenMP并行，因而考虑重新设计并行方案。closure运算的本质是布尔矩阵乘法，因此采用分块并行矩阵乘法：先用MPI将矩阵分块并行相乘，再在每个子矩阵块的乘法内部使用OpenMP按行划分并行计算。

			
 
				+

			
 
				+###### 核心代码

			
 
				+

			
 
				+```c++

			
 
				+void MatMultAdd(const MatWrap& a, const MatWrap& b, MatWrap& c) {

			
 
				+  assert(a.n_ == b.n_);

			
 
				+  assert(a.n_ == c.n_);

			
 
				+  int n = a.n_;

			
 
				+#pragma omp parallel for num_threads(OMP_THREADS)

			
 
				+  for (int i = 0; i < n; ++i) {

			
 
				+    for (int j = 0; j < n; ++j) {

			
 
				+      for (int k = 0; k < n; ++k) {

			
 
				+        c[i][j] = static_cast<bool>(c[i][j] + a[i][k] * b[k][j]);

			
 
				+      }

			
 
				+    }

			
 
				+  }

			
 
				+}

			
 
				+```

			
 
				+

			
 
				+

			
 
				+

			
 
				+```c++

			
 
				+  for (int k = 0; k <= std::log2f(n); ++k) {

			
 
				+    for (int i = 0; i < sub_n; ++i) {

			
 
				+      for (int j = 0; j < sub_n; ++j) {

			
 
				+        sub_c[i][j] = 0;

			
 
				+      }

			
 
				+    }

			
 
				+    // scatter sub matrix blocks to all ranks

			
 
				+    MPI_Scatter(mat_a, sub_n * sub_n, MPI_INT, sub_mat_a, sub_n * sub_n,

			
 
				+                MPI_INT, 0, MPI_COMM_WORLD);

			
 
				+    MPI_Scatter(mat_a, sub_n * sub_n, MPI_INT, sub_mat_b, sub_n * sub_n,

			
 
				+                MPI_INT, 0, MPI_COMM_WORLD);

			
 
				+    // split comm in col and row

			
 
				+    MPI_Comm col_world, row_world;

			
 
				+    int col_rank = rank % sqrt_q;

			
 
				+    int row_rank = rank / sqrt_q;

			
 
				+    MPI_Comm_split(MPI_COMM_WORLD, col_rank, row_rank, &col_world);

			
 
				+    MPI_Comm_split(MPI_COMM_WORLD, row_rank, col_rank, &row_world);

			
 
				+    // compute

			
 
				+    for (int i = 0; i < sqrt_q; ++i) {

			
 
				+      // broadcast sub_a

			
 
				+      int send_root = (row_rank + i) % sqrt_q;

			
 
				+      if (col_rank == (row_rank + i) % sqrt_q) {

			
 
				+        memcpy(sub_mat_comm, sub_mat_a, sub_n * sub_n * sizeof(int));

			
 
				+      }

			
 
				+      MPI_Bcast(sub_mat_comm, sub_n * sub_n, MPI_INT, send_root, row_world);

			
 
				+      // calculate sub mat gemm

			
 
				+      MatMultAdd(sub_comm, sub_b, sub_c);

			
 
				+      // swap sub_b

			
 
				+      MPI_Sendrecv_replace(

			
 
				+          sub_mat_b, sub_n * sub_n, MPI_INT, (row_rank + sqrt_q - 1) % sqrt_q,

			
 
				+          1, (row_rank + 1) % sqrt_q, 1, col_world, MPI_STATUS_IGNORE);

			
 
				+    }

			
 
				+    // gather result

			
 
				+    MPI_Gather(sub_mat_c, sub_n * sub_n, MPI_INT, mat_a, sub_n * sub_n, MPI_INT,

			
 
				+               0, MPI_COMM_WORLD);

			
 
				+    // print result

			
 
				+    if (rank == 0) {

			
 
				+      MatWrap mc(mat_a, n);

			
 
				+      printf("loop:%d\n", k);

			
 
				+      mc.print(true, sqrt_q);

			
 
				+    }

			
 
				+  }

			
 
				+```

			
 
				+

			
 
				+*完整代码见附件*

			
 
				+

			
 
				 ##### 性能结果

			
 
				 

			
 
				-| 问题规模 | OpenMP线程数 | MPI进程数 | 总运行时间(sec) |

			
 
				-| -------- | ------------ | --------- | --------------- |

			
 
				-| 64       | 1            | 1         | 0.122           |

			
 
				-| 64       | 2            | 1         | 0.090           |

			
 
				-| 64       | 4            | 1         | 0.067           |

			
 
				-| 256      | 1            | 4         | 1.255           |

			
 
				-| 256      | 2            | 4         | 0.834           |

			
 
				-| 256      | 4            | 4         | 0.647           |

			
 
				+| 问题规模 | OpenMP线程数 | MPI进程数 | 总运行时间(sec) | 加速比 |

			
 
				+| -------- | ------------ | --------- | --------------- | ------ |

			
 
				+| 64       | 1            | 1         | 0.122           | 1.00   |

			
 
				+| 64       | 2            | 1         | 0.090           | 1.36   |

			
 
				+| 64       | 4            | 1         | 0.067           | 1.82   |

			
 
				+| 256      | 1            | 4         | 1.255           | 1.00   |

			
 
				+| 256      | 2            | 4         | 0.834           | 1.50   |

			
 
				+| 256      | 4            | 4         | 0.647           | 1.94   |

			
 
				 

			
 
				 ### gauss-MPI

			
 
				 

			
@@ -646,13 +724,14 @@ void sigmoid(float *x, float *y, int n) {
 
				 | 8192     | 16     | 100.614923 | 16.937205    | 83.677718    |

			
 
				 

			
 
				 2

			
 
				-| 问题规模 | 任务数 | 总运行时间 | 分发数据时间 | 并行计算时间 |

			
 
				-| -------- | ------ | ---------- | ------------ | ------------ |

			
 
				-| 2048     | 1      | 5.461998   | 1.217323     | 4.244675     |

			
 
				-| 2048     | 2      | 4.983760   | 1.240886     | 3.742874     |

			
 
				-| 2048     | 4      | 4.045694   | 1.148780     | 2.896914     |

			
 
				-| 2048     | 8      | 4.017666   | 1.161650     | 2.856016     |

			
 
				-| 2048     | 16     | 4.135822   | 1.286142     | 2.849679     |

			
 
				+

			
 
				+| 问题规模 | 任务数 | 总运行时间 | 分发数据时间 | 并行计算时间 | 加速比 |

			
 
				+| -------- | ------ | ---------- | ------------ | ------------ | ------ |

			
 
				+| 2048     | 1      | 5.461998   | 1.217323     | 4.244675     | 1.00   |

			
 
				+| 2048     | 2      | 4.983760   | 1.240886     | 3.742874     | 1.13   |

			
 
				+| 2048     | 4      | 4.045694   | 1.148780     | 2.896914     | 1.47   |

			
 
				+| 2048     | 8      | 4.017666   | 1.161650     | 2.856016     | 1.49   |

			
 
				+| 2048     | 16     | 4.135822   | 1.286142     | 2.849679     | 1.49   |

			
 
				 

			
 
				 

			
 
				 ##### 改进

			
@@ -666,3 +745,28 @@ void sigmoid(float *x, float *y, int n) {
 
				 

			
 
				 ### fft-MPI

			
 
				 

			
 
				+##### 性能结果

			
 
				+1

			
 
				+| 问题规模 | MPI进程数 | 总运行时间(sec) |

			
 
				+| -------- | --------- | --------------- |

			
 
				+| 512      | 1         | 0.036836        |

			
 
				+| 1024     | 2         | 0.073674        |

			
 
				+| 2048     | 4         | 0.148747        |

			
 
				+| 4096     | 8         | 0.347353        |

			
 
				+2

			
 
				+

			
 
				+| 问题规模 | MPI进程数 | 总运行时间(sec) | 加速比 |

			
 
				+| -------- | --------- | --------------- | ------ |

			
 
				+| 4096     | 1         | 2.323001        | 1.00   |

			
 
				+| 4096     | 2         | 1.162240        | 2.00   |

			
 
				+| 4096     | 4         | 0.605259        | 3.84   |

			
 
				+| 4096     | 8         | 0.349958        | 6.64   |

			
 
				+

			
 
				+##### 改进

			
 
				+

			
 
				+* 使用Python，numpy生成输入数据

			
 
				+* 使用管道重定向程序输出

			
 
				+* 自动化性能测试脚本

			
 
				+* 修改了输入数量规模上限

			
 
				+

			
 
				+*完整代码见附件*