使用 OpenMP 实现多线程计算，提高运行速度
下面是优化前的测试代码（串行版本）：
//OpenMP version. Edit and submit only this file.
/* Enter your details below
* Name :
* UCLA ID :
* Email :
*/
#include <stdlib.h>
#include <stdio.h>
#include <omp.h>
#include "utils.h"
/*
 * work_it_par - serial baseline of the three-pass grid computation.
 *
 * Only cells whose three indices all lie in 1 .. DIM-2 are visited as
 * centres.
 *
 *   Pass 1: for every visited cell of `old`, multiply it by
 *           we_need_the_func(), divide by gimmie_the_func(), and add the
 *           result to `aggregate` (which starts at 1); print the total.
 *   Pass 2: store in each visited cell of `new` the sum of the 27 cells of
 *           the surrounding 3x3x3 cube in `old`, divided by 27.
 *   Pass 3: increment histogrammy[new[cell] / 100], with the bin clamped
 *           to the range 0..9.
 *
 * old: input grid, flattened as i*DIM*DIM + j*DIM + k (read only here).
 * new: output grid with the same layout; visited cells are overwritten.
 *
 * DIM, histogrammy, we_need_the_func() and gimmie_the_func() are not defined
 * in this listing (presumably they come from utils.h).
 */
#define CELL_AT(a, b, c) ((a)*DIM*DIM+(b)*DIM+(c))
void work_it_par(long *old, long *new) {
    int x, y, z;      /* coordinates of the cell being processed */
    int dx, dy, dz;   /* offsets inside the 3x3x3 neighbourhood */
    int bin;
    long term;
    long aggregate = 1;

    /* Pass 1: accumulate the scaled value of every visited cell. */
    for (x = 1; x < DIM-1; x++) {
        for (y = 1; y < DIM-1; y++) {
            for (z = 1; z < DIM-1; z++) {
                term = old[CELL_AT(x, y, z)] * we_need_the_func();
                aggregate += term / gimmie_the_func();
            }
        }
    }
    printf("AGGR:%ld\n",aggregate);

    /* Pass 2: 3x3x3 box average, accumulated directly in the output cell. */
    for (x = 1; x < DIM-1; x++) {
        for (y = 1; y < DIM-1; y++) {
            for (z = 1; z < DIM-1; z++) {
                long *cell = &new[CELL_AT(x, y, z)];

                *cell = 0;
                for (dx = -1; dx <= 1; dx++) {
                    for (dy = -1; dy <= 1; dy++) {
                        for (dz = -1; dz <= 1; dz++) {
                            *cell += old[CELL_AT(x+dx, y+dy, z+dz)];
                        }
                    }
                }
                *cell /= 27;
            }
        }
    }

    /* Pass 3: histogram of the averaged values in steps of 100. */
    for (x = 1; x < DIM-1; x++) {
        for (y = 1; y < DIM-1; y++) {
            for (z = 1; z < DIM-1; z++) {
                bin = (new[CELL_AT(x, y, z)] / 100);
                if (bin < 0) {
                    bin = 0;
                } else if (bin > 9) {
                    bin = 9;
                }
                histogrammy[bin]++;
            }
        }
    }
}
#undef CELL_AT
这段代码在 4 核 CPU、16 GB 内存的电脑上大约需要运行 6 秒；使用 OpenMP 并行化之后，只需要 1.7 秒。
修改后的代码如下：
//OpenMP version. Edit and submit only this file.
/* Enter your details below
* Name :
* UCLA ID :
* Email :
*/
#include <stdlib.h>
#include <stdio.h>
#include <omp.h>
#include "utils.h"
/*
 * work_it_par - OpenMP-parallel version of the three-pass grid computation.
 *
 * Only cells whose three indices all lie in 1 .. DIM-2 are visited as
 * centres.
 *
 *   Pass 1: for every visited cell of `old`, multiply it by
 *           we_need_the_func(), divide by gimmie_the_func(), and add the
 *           result to `aggregate` (which starts at 1); print the total.
 *   Pass 2: store in each visited cell of `new` the sum of the 27 cells of
 *           the surrounding 3x3x3 cube in `old`, divided by 27.
 *   Pass 3: add one to histogrammy[new[cell] / 100], with the bin clamped
 *           to the range 0..9.
 *
 * old: input grid, flattened as i*DIM*DIM + j*DIM + k (read only here).
 * new: output grid with the same layout; visited cells are overwritten.
 *      Pass 2 reads `old` while other threads write `new`, so the two
 *      buffers must not overlap.
 *
 * DIM, histogrammy, we_need_the_func() and gimmie_the_func() are not defined
 * in this listing (presumably they come from utils.h).
 *
 * If the compiler is run without OpenMP support the pragmas are ignored and
 * the function computes the same results serially.
 */
void work_it_par(long *old, long *new) {
    enum { HIST_BINS = 10 };   /* bins 0..9, matching the clamp in pass 3 */
    long aggregate = 1;

    /*
     * Pass 1. Loop counters and compute_it are declared inside the loops so
     * that each thread gets its own copy; only `aggregate` is combined
     * across threads, via the reduction clause.
     *
     * NOTE(review): we_need_the_func() and gimmie_the_func() are called
     * concurrently from several threads here - confirm they are thread-safe.
     */
    #pragma omp parallel for reduction(+:aggregate)
    for (int i = 1; i < DIM-1; i++) {
        for (int j = 1; j < DIM-1; j++) {
            for (int k = 1; k < DIM-1; k++) {
                long compute_it = old[i*DIM*DIM+j*DIM+k] * we_need_the_func();
                aggregate += compute_it / gimmie_the_func();
            }
        }
    }
    printf("AGGR:%ld\n",aggregate);

    /*
     * Pass 2. Every iteration writes a different cell of `new`, so no
     * reduction is needed; `temp` lives inside the loop body and is
     * therefore private to the iteration that uses it.
     */
    #pragma omp parallel for
    for (int i = 1; i < DIM-1; i++) {
        for (int j = 1; j < DIM-1; j++) {
            for (int k = 1; k < DIM-1; k++) {
                long temp = 0;

                for (int u = -1; u <= 1; u++) {
                    for (int v = -1; v <= 1; v++) {
                        for (int w = -1; w <= 1; w++) {
                            temp += old[(i+u)*DIM*DIM+(j+v)*DIM+(k+w)];
                        }
                    }
                }
                new[i*DIM*DIM+j*DIM+k] = temp/27;
            }
        }
    }

    /*
     * Pass 3. Each thread counts into its own bin array and merges it into
     * the shared histogrammy[] once, inside a critical section, so the hot
     * loop never touches shared counters.
     */
    #pragma omp parallel
    {
        long local_hist[HIST_BINS] = {0};

        #pragma omp for nowait
        for (int i = 1; i < DIM-1; i++) {
            for (int j = 1; j < DIM-1; j++) {
                for (int k = 1; k < DIM-1; k++) {
                    /* Converted to int before clamping, as in the serial listing. */
                    int u = (int)(new[i*DIM*DIM+j*DIM+k]/100);

                    if (u <= 0) {
                        u = 0;
                    }
                    if (u >= HIST_BINS-1) {
                        u = HIST_BINS-1;
                    }
                    local_hist[u]++;
                }
            }
        }

        #pragma omp critical
        {
            for (int b = 0; b < HIST_BINS; b++) {
                histogrammy[b] += local_hist[b];
            }
        }
    }
}