#1
  1. Contributing User
    Devshed Demi-God (4500 - 4999 posts)

    Join Date
    Aug 2011
    Posts
    4,837
    Rep Power
    480

    Parallel code gives different answer


    Compiled with

    gfortran -std=f2008 -Wall -fopenmp -ffree-form -fall-intrinsics g.f08 -o g

    The output isn't always correct with 2 threads. Which regions of the code aren't blocked correctly? Search for
    !$OMP

    Code:
    ! compute permanent or determinant
    program f
    
      implicit none
      integer, parameter :: N = 11
      real, dimension(3,3) :: j, m
      real, dimension(N,N) :: timeme
      integer :: i, k
      data j/ 2,-1, 1,-1,-2, 1,-1,-1,-1/
      data m/ 2, 9, 4, 7, 5, 3, 6, 1, 8/
    
      open(unit=7,file='/tmp/a')
      read(7,'(11f8.4)')((timeme(i, k),i=1,N), k=1,N)
      close(7)
      write(6,*) 'j example, determinant: ',det(j,3,-1)
      write(6,*) 'j example, permanent:   ',det(j,3,1)
      write(6,*) 'maxima, determinant:    ',det(m,3,-1)
      write(6,*) 'maxima, permanent:      ',det(m,3,1)
      write(6,*) 'parallel timing, det:   ',det(timeme, N, -1)
    
    contains
    
      function det(a,n,permanent) result(accumulation)
        integer, intent(in) :: n, permanent
        real, dimension(n,n), intent(in) :: a
        real, dimension(n-1, n-1) :: b
        real :: accumulation, cofactor
        integer :: i, sgn
        if (n .eq. 1) then
          accumulation = a(1,1)
        else
    !$OMP PARALLEL PRIVATE(i, b, sgn, cofactor) SHARED(n, a, accumulation, permanent)
          accumulation = 0
          sgn = 1
    !$OMP DO
          do i=1, n
            b(:, :(i-1)) = a(2:, :i-1)
            b(:, i:) = a(2:, i+1:)
            cofactor = sgn * a(1, i) * detBackEnd(b, n-1, permanent)
            sgn = sgn * permanent
    !$OMP CRITICAL
            accumulation = accumulation + cofactor
    !$OMP END CRITICAL
          enddo
    !$OMP END DO
    !$OMP END PARALLEL
        endif
        
      end function det
    
      recursive function detBackEnd(a,n,permanent) result(accumulation)
        integer, intent(in) :: n, permanent
        real, dimension(n,n), intent(in) :: a
        real, dimension(n-1, n-1) :: b
        real :: accumulation
        integer :: i, sgn
        if (n .eq. 1) then
          accumulation = a(1,1)
        else
          accumulation = 0
          sgn = 1
          do i=1, n
            b(:, :(i-1)) = a(2:, :i-1)
            b(:, i:) = a(2:, i+1:)
            accumulation = accumulation + sgn * a(1, i) * detBackEnd(b, n-1, permanent)
            sgn = sgn * permanent
          enddo
        endif
      end function detBackEnd
    
    end program f
    [code]Code tags[/code] are essential for python code and Makefiles!
  2. #2
  3. Contributing User
    Devshed Demi-God (4500 - 4999 posts)

    Join Date
    Aug 2011
    Posts
    4,837
    Rep Power
    480

    gcc c translation appears to work.


    Code:
    #if 0
      -*- mode: compilation; default-directory: "/tmp/" -*-
      Compilation started at Tue Jul 16 23:19:47
    
      cc -fopenmp -Wall -O3 -g c.c -o c && OMP_NUM_THREADS=2 time $a ./c
      serial and parallel determinant computations
        Expect: 3.48913
        Serial: 3.48912
      sleep 5 seconds to mark your cpu monitor
      Parallel: 3.48912
    
      	speedup: 76%  ratio of parallel to single thread time.
      123.11user 0.09system 1:32.56elapsed 133%CPU (0avgtext+0avgdata 780maxresident)k
      0inputs+0outputs (0major+247minor)pagefaults 0swaps
    
      Compilation finished at Tue Jul 16 23:21:20
    #endif
    
    
    
    #include<stdlib.h>
    #include<string.h>
    #include<unistd.h>
    #include<stdio.h>
    #include<time.h>
    
    void error(int status, char*message) {
      fprintf(stderr,"\n%s\n",message);
      exit(status);
    }
    
    void*dwlcalloc(int n,size_t bytes) {
      void*rv = (void*)calloc(n,bytes);
      if (NULL == rv)
        error(1, "memory allocation failure");
      return rv;
    }
    
    void dwlfree(void**ppv) {
      if ((NULL != ppv) && (NULL != *ppv))
        free(*ppv), *ppv = NULL;
    }
    
    void*allocarray(size_t rank,size_t*shape,size_t itemSize) {
      /*
         Allocates arbitrary dimensional arrays (and inits all pointers)
         with only 1 call to malloc.  Lambert Electronics, USA, NY.
         This is wonderful because one only need call free once to deallocate
         the space.  Special routines for each size array are not need for
         allocation of for deallocation.  Also calls to malloc might be expensive
         because they might have to place operating system requests.  One call
         seems optimal.
      */
      size_t size,i,j,dataSpace,pointerSpace,pointers,nextLevelIncrement;
      char*memory,*pc,*nextpc;
      if (rank < 2) {
        if (rank < 0)
          error(1,"invalid negative rank argument passed to allocarray");
        size = rank < 1 ? 1 : *shape;
        return dwlcalloc(size,itemSize);
      }
      pointerSpace = 0, dataSpace = 1;
      for (i = 0; i < rank-1; ++i)
        pointerSpace += (dataSpace *= shape[i]);
      pointerSpace *= sizeof(char*);
      dataSpace *= shape[i]*itemSize;
      memory = pc = dwlcalloc(1,pointerSpace+dataSpace);
      pointers = 1;
      for (i = 0; i < rank-2; ) {
        nextpc = pc + (pointers *= shape[i])*sizeof(char*);
        nextLevelIncrement = shape[++i]*sizeof(char*);
        for (j = 0; j < pointers; ++j)
          *((char**)pc) = nextpc, pc+=sizeof(char*), nextpc += nextLevelIncrement;
      }
      nextpc = pc + (pointers *= shape[i])*sizeof(char*);
      nextLevelIncrement = shape[++i]*itemSize;
      for (j = 0; j < pointers; ++j)
        *((char**)pc) = nextpc, pc+=sizeof(char*), nextpc += nextLevelIncrement;
      return memory;
    }
    
    #define ODD(A) (1 & (A))
    
    float sign[] = {1, -1};
    
    float detser(float**a, int n) {
      float**b, sum;
      size_t rank, shape[2], i, j, k;
      if (1 == n)
        return **a;
      rank = 2;
      shape[0] = shape[1] = n-1;
      b = allocarray(rank, shape, sizeof(float));
      sum = 0;
      for (k = 0; k < n; ++k) {
        for (j = 1; j < n; ++j) {
          for (i = 0; i < k; ++i)
    	b[j-1][i] = a[j][i];
          for (i = k+1; i < n; ++i)
    	b[j-1][i-1] = a[j][i];
        }
        sum += sign[ODD(k)] * a[0][k] * detser(b, n-1);
      }
      dwlfree((void**)(&b));
      return sum;
    }
    
    float detpar(float**a, int n) {
      float***b, *local_det, sum;
      size_t rank, shape[3], i, j, k;
      if (1 == n)
        return **a;
      rank = 3;
      shape[0] = n, shape[1] = shape[2] = n-1;
      b = allocarray(rank, shape, sizeof(float));
      local_det = allocarray(1, shape, sizeof(float));
    # pragma omp parallel for default(shared) private(i,j,k)
      for (k = 0; k < n; ++k) {
        for (j = 1; j < n; ++j) {
          for (i = 0; i < k; ++i)
    	b[k][j-1][i] = a[j][i];
          for (i = k+1; i < n; ++i)
    	b[k][j-1][i-1] = a[j][i];
        }
        local_det[k] = detser(b[k], n-1);
      }
      sum = 0;
      for (k = 0; k < n; ++k)
        sum += sign[ODD(k)] * a[0][k] * local_det[k];
      dwlfree((void**)(&b)), dwlfree((void**)&local_det);
      return sum;
    }
    
    /* note that the data is taken left to right, not as a square matrix cut out from top left */
    
    float data[] = {
      -0.923274,  -0.341432,-0.328711,  0.971943,-0.883249,-0.435348, 0.383719,  0.45886,  0.541108, -0.997454, -0.166181,-0.624975,
      -0.506547,   0.791247, 0.514168, -0.810503, 0.186398,-0.761007,-0.730399,-0.215569, -0.521056,   0.47368,  0.192582, 0.924478,
      0.0577579,    0.17293, 0.752496,  0.663678,-0.516617,-0.119594,0.0427356,-0.122713, -0.689729, -0.202648, -0.998568,-0.723365,
       -0.56225,-0.00425167, 0.662194,-0.0391025, 0.218974, 0.443948,  0.99204, 0.492961, -0.202468, -0.175486,  0.786253,0.0606093,
       0.240993,  -0.757153, -0.79544, 0.0173647,-0.929971,-0.894012, 0.235547,-0.517109,  0.182032,  -0.90274, -0.852879, 0.212104,
       0.614065,   0.030626,-0.286679,  0.908969, 0.604646, 0.720185,-0.244579,-0.178661, -0.950055,-0.0120252, -0.105445,0.0531578,
       0.533407,   0.174958, 0.903334,  0.156381, 0.543367,-0.762282,-0.428918, 0.709452, -0.252296, -0.487895,  -0.14029, 0.264112,
      -0.194226, -0.0715186, 0.602734, -0.633329, 0.648061,-0.377232,  0.76431,-0.774184,    0.2458, -0.563082,  -0.52861, 0.435124,
       0.426403,  -0.965024,-0.723339, -0.279136,-0.667545,-0.677537, 0.634901, 0.557254, -0.235998, -0.999848,-0.0394405, 0.378694,
        0.34791,   0.921768, 0.654681,  0.660782, 0.598307,0.0473692,-0.838722, 0.414236, -0.390647,  0.267905,  0.231926, 0.977179,
        0.60767,  -0.481669, 0.405516, -0.158412, -0.48456,0.0750193, 0.932425,0.0998887,-0.0409722,  0.273858, -0.716461, 0.488875,
       0.297553,  -0.252373, -0.02735,  0.595922, 0.919663, 0.714662, 0.963793, 0.345256,  0.538883,  0.301958,  0.923339, 0.162875
    };
    
    int main() {
      time_t t[4];
      size_t rank, shape[2];
      float**a;
      rank = 2, shape[0] = shape[1] = 12;
      a = allocarray(rank, shape, sizeof(float));
      memcpy(*a, data, shape[0]*shape[1]*sizeof(float));
      puts("serial and parallel determinant computations");
      puts("  Expect: 3.48913");	/* 10x10 -4.85314"); */ /* note: expected value computed by efficient algorithm */
      t[0] = time(NULL), printf("  Serial: %g\n",detser(a, shape[0])), t[1] = time(NULL);
      puts("sleep 5 seconds to mark your cpu monitor");
      sleep(5);
      t[2] = time(NULL), printf("Parallel: %g\n",detpar(a, shape[0])), t[3] = time(NULL);
      printf("\n\tspeedup: %d%%  ratio of parallel to single thread time.\n", (int)(0.5 + 100.0*(t[3]-t[2])/(t[1]-t[0])));
      dwlfree((void**)(&a));
      return EXIT_SUCCESS;
    }
    Next, carefully implement the FORTRAN version to work like this.
    [code]Code tags[/code] are essential for python code and Makefiles!
  4. #3
  5. Contributing User
    Devshed Demi-God (4500 - 4999 posts)

    Join Date
    Aug 2011
    Posts
    4,837
    Rep Power
    480
    Code:
    ! compute permanent or determinant
    program f
    
      implicit none
      integer, parameter :: N = 12
      real, dimension(3,3) :: j, m
      real, dimension(N,N) :: timeme
      real, dimension(4) :: seconds
      integer :: i, k
      data j/ 2,-1, 1,-1,-2, 1,-1,-1,-1/
      data m/ 2, 9, 4, 7, 5, 3, 6, 1, 8/
    
      open(unit=7,file='/tmp/a')
      read(7,*)((timeme(i, k),i=1,N), k=1,N)
      close(7)
      write(6,*) 'j example, determinant: ',detser(j,3,-1)
      write(6,*) 'j example, permanent:   ',detser(j,3,1)
      write(6,*) 'maxima, determinant:    ',detser(m,3,-1)
      write(6,*) 'maxima, permanent:      ',detser(m,3,1)
      seconds(1) = mytime()
      write(6,*) 'serial timing, det:     ',detser(timeme, N, -1)
      seconds(2) = mytime()
      write(6,*) 'j example, determinant: ',detpar(j,3,-1)
      write(6,*) 'j example, permanent:   ',detpar(j,3,1)
      write(6,*) 'maxima, determinant:    ',detpar(m,3,-1)
      write(6,*) 'maxima, permanent:      ',detpar(m,3,1)
      call sleep(5)
      seconds(3) = mytime()
      write(6,*) 'parallel timing, det:   ',detpar(timeme, N, -1)
      seconds(4) = mytime()
      write(6,*) '      ratio parallel/serial:',ifix(0.5+100*(seconds(4)-seconds(3))/(seconds(2)-seconds(1))),'%'
    
    contains
    
      recursive function detpar(a,n,permanent) result(accumulation)
        integer, intent(in) :: n, permanent
        real, dimension(n,n), intent(in) :: a
        real, dimension(n, n-1, n-1) :: b
        real, dimension(n) :: local_det
        real :: accumulation
        integer :: j, k, sgn
        if (n .eq. 1) then
           accumulation = a(1,1)
        else
    !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(j,k)
           do k=1,n
              do j = 1, k-1
                 b(k, :, j) = a(2:, j)
              end do
              do j = k+1, n
                 b(k, :, j-1) = a(2:, j)
              end do
              local_det(k) = detser(b(k,:,:), n-1, permanent)
           enddo
           accumulation = 0
           sgn = 1
           do k=1,n
              accumulation = accumulation + sgn * a(1, k) * local_det(k)
              sgn = sgn * permanent
           end do
        endif
      end function detpar
    
      recursive function detser(a,n,permanent) result(accumulation)
        integer, intent(in) :: n, permanent
        real, dimension(n,n), intent(in) :: a
        real, dimension(n-1, n-1) :: b
        real :: accumulation
        integer :: j, k, sgn
        if (n .eq. 1) then
           accumulation = a(1,1)
        else
           accumulation = 0
           sgn = 1
           do k=1, n
              do j = 1, k-1
                 b(:, j) = a(2:, j)
              end do
              do j = k+1, n
                 b(:, j-1) = a(2:, j)
              end do
              accumulation = accumulation + sgn * a(1, k) * detser(b, n-1, permanent)
              sgn = sgn * permanent
           enddo
        endif
      end function detser
    
      real function mytime()
        integer(kind=4), dimension(3) :: TArray
        call itime(TArray)
        mytime = real(TArray(3) + 60 * (TArray(2) + 60 * (TArray(1))))
      end function mytime
    
    end program f
    This runs both cores together and gives correct answer. Is it faster?
    The file /tmp/a contains
    Code:
    -0.923274   -0.341432 -0.328711   0.971943 -0.883249 -0.435348  0.383719   0.45886   0.541108  -0.997454  -0.166181 -0.624975
    -0.506547    0.791247  0.514168  -0.810503  0.186398 -0.761007 -0.730399 -0.215569  -0.521056    0.47368   0.192582  0.924478
    0.0577579     0.17293  0.752496   0.663678 -0.516617 -0.119594 0.0427356 -0.122713  -0.689729  -0.202648  -0.998568 -0.723365
     -0.56225 -0.00425167  0.662194 -0.0391025  0.218974  0.443948   0.99204  0.492961  -0.202468  -0.175486   0.786253 0.0606093
     0.240993   -0.757153  -0.79544  0.0173647 -0.929971 -0.894012  0.235547 -0.517109   0.182032   -0.90274  -0.852879  0.212104
     0.614065    0.030626 -0.286679   0.908969  0.604646  0.720185 -0.244579 -0.178661  -0.950055 -0.0120252  -0.105445 0.0531578
     0.533407    0.174958  0.903334   0.156381  0.543367 -0.762282 -0.428918  0.709452  -0.252296  -0.487895   -0.14029  0.264112
    -0.194226  -0.0715186  0.602734  -0.633329  0.648061 -0.377232   0.76431 -0.774184     0.2458  -0.563082   -0.52861  0.435124
     0.426403   -0.965024 -0.723339  -0.279136 -0.667545 -0.677537  0.634901  0.557254  -0.235998  -0.999848 -0.0394405  0.378694
      0.34791    0.921768  0.654681   0.660782  0.598307 0.0473692 -0.838722  0.414236  -0.390647   0.267905   0.231926  0.977179
      0.60767   -0.481669  0.405516  -0.158412  -0.48456 0.0750193  0.932425 0.0998887 -0.0409722   0.273858  -0.716461  0.488875
     0.297553   -0.252373  -0.02735   0.595922  0.919663  0.714662  0.963793  0.345256   0.538883   0.301958   0.923339  0.162875
    Last edited by b49P23TIvg; July 17th, 2013 at 07:34 AM.
    [code]Code tags[/code] are essential for python code and Makefiles!
  6. #4
  7. Contributing User
    Devshed Demi-God (4500 - 4999 posts)

    Join Date
    Aug 2011
    Posts
    4,837
    Rep Power
    480

    For completeness...


    Using a 4 core laptop computer, pgfortran and gfortran behave similarly with this program.

    pgfortran
    Code:
    $ make g
    "c:/Program Files/PGI/win64/13.6/bin/pgfortran.exe" -I../include "-Ic:\Program Files (x86)\PGI\win32\13.6\include" -Minform=warn -tp=px-32 -Mbackslash -mp -Lc:\PROGRA~2\PGI\win32\13.6\lib g.f90 -lstd -o g
    $ OMP_NUM_THREADS=4 g
     j example, determinant:     7.000000    
     j example, permanent:       5.000000    
     maxima, determinant:       -360.0000    
     maxima, permanent:          900.0000    
     serial timing, det:         3.489127    
     j example, determinant:     7.000000    
     j example, permanent:       5.000000    
     maxima, determinant:       -360.0000    
     maxima, permanent:          900.0000    
     parallel timing, det:       3.489127    
           ratio parallel/serial:           51 %
    $
    gfortran
    Code:
    $ gfortran -m32 -Wall -fopenmp g.f90 
    $ OMP_NUM_THREADS=4 g
     j example, determinant:     7.000000    
     j example, permanent:       5.000000    
     maxima, determinant:       -360.0000    
     maxima, permanent:          900.0000    
     serial timing, det:         3.489127    
     j example, determinant:     7.000000    
     j example, permanent:       5.000000    
     maxima, determinant:       -360.0000    
     maxima, permanent:          900.0000    
     parallel timing, det:       3.489127    
           ratio parallel/serial:           50 %
    $
    [code]Code tags[/code] are essential for python code and Makefiles!

IMN logo majestic logo threadwatch logo seochat tools logo