使用 OpenMP 实现多线程计算,提高运行速度

多线程编程

Posted by VK on August 30, 2019

使用 OpenMP 实现多线程计算,提高运行速度

如下是测试的代码:

//OpenMP version.  Edit and submit only this file.
/* Enter your details below
 * Name :
 * UCLA ID :
 * Email :
 */

#include <stdlib.h>
#include <stdio.h>
#include <omp.h>

#include "utils.h"

/*
 * Sequential reference version.
 *
 * `old` and `new` are indexed as flattened DIM x DIM x DIM grids
 * (index = x*DIM*DIM + y*DIM + z); only interior cells (1 .. DIM-2 on each
 * axis) are visited.
 *
 *   Pass 1: sum old[cell] * we_need_the_func() / gimmie_the_func() over the
 *           interior into an aggregate that starts at 1, then print it.
 *   Pass 2: write into new[cell] the integer mean of the 27 cells of the
 *           3x3x3 neighbourhood of `cell` in `old`.
 *   Pass 3: increment histogrammy[new[cell] / 100], with the bucket index
 *           clamped to the range 0..9.
 *
 * DIM, histogrammy, we_need_the_func() and gimmie_the_func() are not defined
 * in this listing; presumably they come from utils.h.
 */
void work_it_par(long *old, long *new) {
  int x, y, z;
  int dx, dy, dz;
  int cell;
  int bucket;
  long scaled;
  long total = 1;

  /* Pass 1: weighted aggregate over the interior cells. */
  for (x = 1; x < DIM-1; x++) {
    for (y = 1; y < DIM-1; y++) {
      for (z = 1; z < DIM-1; z++) {
        scaled = old[x*DIM*DIM+y*DIM+z] * we_need_the_func();
        total += scaled / gimmie_the_func();
      }
    }
  }

  printf("AGGR:%ld\n", total);

  /* Pass 2: 3x3x3 box average, accumulated directly in the output cell. */
  for (x = 1; x < DIM-1; x++) {
    for (y = 1; y < DIM-1; y++) {
      for (z = 1; z < DIM-1; z++) {
        cell = x*DIM*DIM+y*DIM+z;
        new[cell] = 0;
        for (dx = -1; dx <= 1; dx++) {
          for (dy = -1; dy <= 1; dy++) {
            for (dz = -1; dz <= 1; dz++) {
              new[cell] += old[(x+dx)*DIM*DIM+(y+dy)*DIM+(z+dz)];
            }
          }
        }
        new[cell] /= 27;
      }
    }
  }

  /* Pass 3: histogram of the averaged values in buckets of width 100. */
  for (x = 1; x < DIM-1; x++) {
    for (y = 1; y < DIM-1; y++) {
      for (z = 1; z < DIM-1; z++) {
        bucket = (new[x*DIM*DIM+y*DIM+z] / 100);
        if (bucket < 0) {
          bucket = 0;
        } else if (bucket > 9) {
          bucket = 9;
        }
        histogrammy[bucket]++;
      }
    }
  }
}

这段代码在一台 4 核 CPU、16 GB 内存的电脑上大约需要运行 6 秒;使用 OpenMP 并行化之后,只需要约 1.7 秒。

修改后的代码如下:

//OpenMP version.  Edit and submit only this file.
/* Enter your details below
 * Name :
 * UCLA ID :
 * Email :
 */

#include <stdlib.h>
#include <stdio.h>
#include <omp.h>

#include "utils.h"

/*
 * OpenMP-parallel version of work_it_par.
 *
 * `old` and `new` are indexed as flattened DIM x DIM x DIM grids
 * (index = i*DIM*DIM + j*DIM + k); only interior cells (1 .. DIM-2 on each
 * axis) are visited. Each of the three passes is split across threads along
 * the outermost index i.
 *
 *   Pass 1: sum old[cell] * we_need_the_func() / gimmie_the_func() over the
 *           interior into `aggregate` (starts at 1), then print it.
 *   Pass 2: new[cell] = integer mean of the 3x3x3 neighbourhood in `old`.
 *   Pass 3: increment histogrammy[new[cell] / 100], bucket clamped to 0..9.
 *
 * The directives only take effect when the file is compiled with OpenMP
 * enabled (e.g. -fopenmp for GCC/Clang); otherwise they are ignored and the
 * function runs serially with the same result.
 *
 * NOTE(review): assumes we_need_the_func() and gimmie_the_func() (presumably
 * from utils.h) are safe to call from several threads at once -- confirm.
 * NOTE(review): assumes `old` and `new` do not overlap -- confirm with caller.
 */
void work_it_par(long *old, long *new) {
  int i, j, k;
  int u, v, w;
  long aggregate = 1;

  /*
   * The iteration variable of the parallelised loop (i) is private
   * automatically; the inner indices j and k are function-scope variables
   * and must be privatised explicitly. `aggregate` is combined across
   * threads with a sum reduction.
   */
#pragma omp parallel for private(j, k) reduction(+:aggregate)
  for (i=1; i<DIM-1; i++) {
    for (j=1; j<DIM-1; j++) {
      for (k=1; k<DIM-1; k++) {
        /* Declared in the loop body so every thread gets its own copy. */
        long compute_it = old[i*DIM*DIM+j*DIM+k] * we_need_the_func();
        aggregate += compute_it / gimmie_the_func();
      }
    }
  }

  printf("AGGR:%ld\n",aggregate);

  /*
   * Every (i,j,k) iteration writes a distinct element of `new`, and `temp`
   * lives inside the loop body, so no reduction is required here -- only
   * the loop indices need to be private.
   */
#pragma omp parallel for private(j, k, u, v, w)
  for (i=1; i<DIM-1; i++) {
    for (j=1; j<DIM-1; j++) {
      for (k=1; k<DIM-1; k++) {
        /* Accumulate locally and store once instead of updating new[] 27 times. */
        long temp = 0;
        for (u=-1; u<=1; u++) {
          for (v=-1; v<=1; v++) {
            for (w=-1; w<=1; w++) {
              temp += old[(i+u)*DIM*DIM+(j+v)*DIM+(k+w)];
            }
          }
        }
        new[i*DIM*DIM+j*DIM+k] = temp/27;
      }
    }
  }

  /*
   * Concurrent histogrammy[u]++ would be a data race, so the ten buckets are
   * mirrored in scalar variables that can appear in a reduction clause. Each
   * scalar starts from the bucket's current value; the reduction adds the
   * per-thread counts on top of it.
   */
  long h0 = histogrammy[0], h1 = histogrammy[1], h2 = histogrammy[2],
       h3 = histogrammy[3], h4 = histogrammy[4], h5 = histogrammy[5],
       h6 = histogrammy[6], h7 = histogrammy[7], h8 = histogrammy[8],
       h9 = histogrammy[9];

#pragma omp parallel for private(j, k, u) reduction(+:h0,h1,h2,h3,h4,h5,h6,h7,h8,h9)
  for (i=1; i<DIM-1; i++) {
    for (j=1; j<DIM-1; j++) {
      for (k=1; k<DIM-1; k++) {
        u = (new[i*DIM*DIM+j*DIM+k]/100);
        if (u<=0) u=0;
        if (u>=9) u=9;
        switch (u) {
          case 0: h0++; break;
          case 1: h1++; break;
          case 2: h2++; break;
          case 3: h3++; break;
          case 4: h4++; break;
          case 5: h5++; break;
          case 6: h6++; break;
          case 7: h7++; break;
          case 8: h8++; break;
          case 9: h9++; break;
          default: break; /* unreachable: u is clamped to 0..9 above */
        }
      }
    }
  }

  /* Publish the combined counts back to the shared histogram. */
  histogrammy[0] = h0;
  histogrammy[1] = h1;
  histogrammy[2] = h2;
  histogrammy[3] = h3;
  histogrammy[4] = h4;
  histogrammy[5] = h5;
  histogrammy[6] = h6;
  histogrammy[7] = h7;
  histogrammy[8] = h8;
  histogrammy[9] = h9;
}