1. #### Parallel code gives different answer

Compiled with

gfortran -std=f2008 -Wall -fopenmp -ffree-form -fall-intrinsics g.f08 -o g

The output isn't always correct with 2 threads. Which regions of the code aren't blocked correctly? Search for
!\$OMP

Code:
```! compute permanent or determinant
program f

implicit none
integer, parameter :: N = 11
real, dimension(3,3) :: j, m
real, dimension(N,N) :: timeme
integer :: i, k
data j/ 2,-1, 1,-1,-2, 1,-1,-1,-1/
data m/ 2, 9, 4, 7, 5, 3, 6, 1, 8/

open(unit=7,file='/tmp/a')
close(7)
write(6,*) 'j example, determinant: ',det(j,3,-1)
write(6,*) 'j example, permanent:   ',det(j,3,1)
write(6,*) 'maxima, determinant:    ',det(m,3,-1)
write(6,*) 'maxima, permanent:      ',det(m,3,1)
write(6,*) 'parallel timing, det:   ',det(timeme, N, -1)

contains

function det(a,n,permanent) result(accumulation)
integer, intent(in) :: n, permanent
real, dimension(n,n), intent(in) :: a
real, dimension(n-1, n-1) :: b
real :: accumulation, cofactor
integer :: i, sgn
if (n .eq. 1) then
accumulation = a(1,1)
else
!\$OMP PARALLEL PRIVATE(i, b, sgn, cofactor) SHARED(n, a, accumulation, permanent)
accumulation = 0
sgn = 1
!\$OMP DO
do i=1, n
b(:, :(i-1)) = a(2:, :i-1)
b(:, i:) = a(2:, i+1:)
cofactor = sgn * a(1, i) * detBackEnd(b, n-1, permanent)
sgn = sgn * permanent
!\$OMP CRITICAL
accumulation = accumulation + cofactor
!\$OMP END CRITICAL
enddo
!\$OMP END DO
!\$OMP END PARALLEL
endif

end function det

recursive function detBackEnd(a,n,permanent) result(accumulation)
integer, intent(in) :: n, permanent
real, dimension(n,n), intent(in) :: a
real, dimension(n-1, n-1) :: b
real :: accumulation
integer :: i, sgn
if (n .eq. 1) then
accumulation = a(1,1)
else
accumulation = 0
sgn = 1
do i=1, n
b(:, :(i-1)) = a(2:, :i-1)
b(:, i:) = a(2:, i+1:)
accumulation = accumulation + sgn * a(1, i) * detBackEnd(b, n-1, permanent)
sgn = sgn * permanent
enddo
endif
end function detBackEnd

end program f```
2. #### gcc c translation appears to work.

Code:
```#if 0
-*- mode: compilation; default-directory: "/tmp/" -*-
Compilation started at Tue Jul 16 23:19:47

cc -fopenmp -Wall -O3 -g c.c -o c && OMP_NUM_THREADS=2 time \$a ./c
serial and parallel determinant computations
Expect: 3.48913
Serial: 3.48912
sleep 5 seconds to mark your cpu monitor
Parallel: 3.48912

speedup: 76%  ratio of parallel to single thread time.
123.11user 0.09system 1:32.56elapsed 133%CPU (0avgtext+0avgdata 780maxresident)k
0inputs+0outputs (0major+247minor)pagefaults 0swaps

Compilation finished at Tue Jul 16 23:21:20
#endif

#include<stdlib.h>
#include<string.h>
#include<unistd.h>
#include<stdio.h>
#include<time.h>

void error(int status, char*message) {
fprintf(stderr,"\n%s\n",message);
exit(status);
}

void*dwlcalloc(int n,size_t bytes) {
void*rv = (void*)calloc(n,bytes);
if (NULL == rv)
error(1, "memory allocation failure");
return rv;
}

void dwlfree(void**ppv) {
if ((NULL != ppv) && (NULL != *ppv))
free(*ppv), *ppv = NULL;
}

void*allocarray(size_t rank,size_t*shape,size_t itemSize) {
/*
Allocates arbitrary dimensional arrays (and inits all pointers)
with only 1 call to malloc.  Lambert Electronics, USA, NY.
This is wonderful because one only need call free once to deallocate
the space.  Special routines for each size array are not need for
allocation of for deallocation.  Also calls to malloc might be expensive
because they might have to place operating system requests.  One call
seems optimal.
*/
size_t size,i,j,dataSpace,pointerSpace,pointers,nextLevelIncrement;
char*memory,*pc,*nextpc;
if (rank < 2) {
if (rank < 0)
error(1,"invalid negative rank argument passed to allocarray");
size = rank < 1 ? 1 : *shape;
return dwlcalloc(size,itemSize);
}
pointerSpace = 0, dataSpace = 1;
for (i = 0; i < rank-1; ++i)
pointerSpace += (dataSpace *= shape[i]);
pointerSpace *= sizeof(char*);
dataSpace *= shape[i]*itemSize;
memory = pc = dwlcalloc(1,pointerSpace+dataSpace);
pointers = 1;
for (i = 0; i < rank-2; ) {
nextpc = pc + (pointers *= shape[i])*sizeof(char*);
nextLevelIncrement = shape[++i]*sizeof(char*);
for (j = 0; j < pointers; ++j)
*((char**)pc) = nextpc, pc+=sizeof(char*), nextpc += nextLevelIncrement;
}
nextpc = pc + (pointers *= shape[i])*sizeof(char*);
nextLevelIncrement = shape[++i]*itemSize;
for (j = 0; j < pointers; ++j)
*((char**)pc) = nextpc, pc+=sizeof(char*), nextpc += nextLevelIncrement;
return memory;
}

#define ODD(A) (1 & (A))

float sign[] = {1, -1};

float detser(float**a, int n) {
float**b, sum;
size_t rank, shape[2], i, j, k;
if (1 == n)
return **a;
rank = 2;
shape[0] = shape[1] = n-1;
b = allocarray(rank, shape, sizeof(float));
sum = 0;
for (k = 0; k < n; ++k) {
for (j = 1; j < n; ++j) {
for (i = 0; i < k; ++i)
b[j-1][i] = a[j][i];
for (i = k+1; i < n; ++i)
b[j-1][i-1] = a[j][i];
}
sum += sign[ODD(k)] * a[0][k] * detser(b, n-1);
}
dwlfree((void**)(&b));
return sum;
}

float detpar(float**a, int n) {
float***b, *local_det, sum;
size_t rank, shape[3], i, j, k;
if (1 == n)
return **a;
rank = 3;
shape[0] = n, shape[1] = shape[2] = n-1;
b = allocarray(rank, shape, sizeof(float));
local_det = allocarray(1, shape, sizeof(float));
# pragma omp parallel for default(shared) private(i,j,k)
for (k = 0; k < n; ++k) {
for (j = 1; j < n; ++j) {
for (i = 0; i < k; ++i)
b[k][j-1][i] = a[j][i];
for (i = k+1; i < n; ++i)
b[k][j-1][i-1] = a[j][i];
}
local_det[k] = detser(b[k], n-1);
}
sum = 0;
for (k = 0; k < n; ++k)
sum += sign[ODD(k)] * a[0][k] * local_det[k];
dwlfree((void**)(&b)), dwlfree((void**)&local_det);
return sum;
}

/* note that the data is taken left to right, not as a square matrix cut out from top left */

float data[] = {
-0.923274,  -0.341432,-0.328711,  0.971943,-0.883249,-0.435348, 0.383719,  0.45886,  0.541108, -0.997454, -0.166181,-0.624975,
-0.506547,   0.791247, 0.514168, -0.810503, 0.186398,-0.761007,-0.730399,-0.215569, -0.521056,   0.47368,  0.192582, 0.924478,
0.0577579,    0.17293, 0.752496,  0.663678,-0.516617,-0.119594,0.0427356,-0.122713, -0.689729, -0.202648, -0.998568,-0.723365,
-0.56225,-0.00425167, 0.662194,-0.0391025, 0.218974, 0.443948,  0.99204, 0.492961, -0.202468, -0.175486,  0.786253,0.0606093,
0.240993,  -0.757153, -0.79544, 0.0173647,-0.929971,-0.894012, 0.235547,-0.517109,  0.182032,  -0.90274, -0.852879, 0.212104,
0.614065,   0.030626,-0.286679,  0.908969, 0.604646, 0.720185,-0.244579,-0.178661, -0.950055,-0.0120252, -0.105445,0.0531578,
0.533407,   0.174958, 0.903334,  0.156381, 0.543367,-0.762282,-0.428918, 0.709452, -0.252296, -0.487895,  -0.14029, 0.264112,
-0.194226, -0.0715186, 0.602734, -0.633329, 0.648061,-0.377232,  0.76431,-0.774184,    0.2458, -0.563082,  -0.52861, 0.435124,
0.426403,  -0.965024,-0.723339, -0.279136,-0.667545,-0.677537, 0.634901, 0.557254, -0.235998, -0.999848,-0.0394405, 0.378694,
0.34791,   0.921768, 0.654681,  0.660782, 0.598307,0.0473692,-0.838722, 0.414236, -0.390647,  0.267905,  0.231926, 0.977179,
0.60767,  -0.481669, 0.405516, -0.158412, -0.48456,0.0750193, 0.932425,0.0998887,-0.0409722,  0.273858, -0.716461, 0.488875,
0.297553,  -0.252373, -0.02735,  0.595922, 0.919663, 0.714662, 0.963793, 0.345256,  0.538883,  0.301958,  0.923339, 0.162875
};

int main() {
time_t t[4];
size_t rank, shape[2];
float**a;
rank = 2, shape[0] = shape[1] = 12;
a = allocarray(rank, shape, sizeof(float));
memcpy(*a, data, shape[0]*shape[1]*sizeof(float));
puts("serial and parallel determinant computations");
puts("  Expect: 3.48913");	/* 10x10 -4.85314"); */ /* note: expected value computed by efficient algorithm */
t[0] = time(NULL), printf("  Serial: %g\n",detser(a, shape[0])), t[1] = time(NULL);
puts("sleep 5 seconds to mark your cpu monitor");
sleep(5);
t[2] = time(NULL), printf("Parallel: %g\n",detpar(a, shape[0])), t[3] = time(NULL);
printf("\n\tspeedup: %d%%  ratio of parallel to single thread time.\n", (int)(0.5 + 100.0*(t[3]-t[2])/(t[1]-t[0])));
dwlfree((void**)(&a));
return EXIT_SUCCESS;
}```
Next, carefully implement the FORTRAN version to work like this.
3. Code:
```! compute permanent or determinant
program f

implicit none
integer, parameter :: N = 12
real, dimension(3,3) :: j, m
real, dimension(N,N) :: timeme
real, dimension(4) :: seconds
integer :: i, k
data j/ 2,-1, 1,-1,-2, 1,-1,-1,-1/
data m/ 2, 9, 4, 7, 5, 3, 6, 1, 8/

open(unit=7,file='/tmp/a')
close(7)
write(6,*) 'j example, determinant: ',detser(j,3,-1)
write(6,*) 'j example, permanent:   ',detser(j,3,1)
write(6,*) 'maxima, determinant:    ',detser(m,3,-1)
write(6,*) 'maxima, permanent:      ',detser(m,3,1)
seconds(1) = mytime()
write(6,*) 'serial timing, det:     ',detser(timeme, N, -1)
seconds(2) = mytime()
write(6,*) 'j example, determinant: ',detpar(j,3,-1)
write(6,*) 'j example, permanent:   ',detpar(j,3,1)
write(6,*) 'maxima, determinant:    ',detpar(m,3,-1)
write(6,*) 'maxima, permanent:      ',detpar(m,3,1)
call sleep(5)
seconds(3) = mytime()
write(6,*) 'parallel timing, det:   ',detpar(timeme, N, -1)
seconds(4) = mytime()
write(6,*) '      ratio parallel/serial:',ifix(0.5+100*(seconds(4)-seconds(3))/(seconds(2)-seconds(1))),'%'

contains

recursive function detpar(a,n,permanent) result(accumulation)
integer, intent(in) :: n, permanent
real, dimension(n,n), intent(in) :: a
real, dimension(n, n-1, n-1) :: b
real, dimension(n) :: local_det
real :: accumulation
integer :: j, k, sgn
if (n .eq. 1) then
accumulation = a(1,1)
else
!\$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(j,k)
do k=1,n
do j = 1, k-1
b(k, :, j) = a(2:, j)
end do
do j = k+1, n
b(k, :, j-1) = a(2:, j)
end do
local_det(k) = detser(b(k,:,:), n-1, permanent)
enddo
accumulation = 0
sgn = 1
do k=1,n
accumulation = accumulation + sgn * a(1, k) * local_det(k)
sgn = sgn * permanent
end do
endif
end function detpar

recursive function detser(a,n,permanent) result(accumulation)
integer, intent(in) :: n, permanent
real, dimension(n,n), intent(in) :: a
real, dimension(n-1, n-1) :: b
real :: accumulation
integer :: j, k, sgn
if (n .eq. 1) then
accumulation = a(1,1)
else
accumulation = 0
sgn = 1
do k=1, n
do j = 1, k-1
b(:, j) = a(2:, j)
end do
do j = k+1, n
b(:, j-1) = a(2:, j)
end do
accumulation = accumulation + sgn * a(1, k) * detser(b, n-1, permanent)
sgn = sgn * permanent
enddo
endif
end function detser

real function mytime()
integer(kind=4), dimension(3) :: TArray
call itime(TArray)
mytime = real(TArray(3) + 60 * (TArray(2) + 60 * (TArray(1))))
end function mytime

end program f```
This runs both cores together and gives correct answer. Is it faster?
The file /tmp/a contains
Code:
```-0.923274   -0.341432 -0.328711   0.971943 -0.883249 -0.435348  0.383719   0.45886   0.541108  -0.997454  -0.166181 -0.624975
-0.506547    0.791247  0.514168  -0.810503  0.186398 -0.761007 -0.730399 -0.215569  -0.521056    0.47368   0.192582  0.924478
0.0577579     0.17293  0.752496   0.663678 -0.516617 -0.119594 0.0427356 -0.122713  -0.689729  -0.202648  -0.998568 -0.723365
-0.56225 -0.00425167  0.662194 -0.0391025  0.218974  0.443948   0.99204  0.492961  -0.202468  -0.175486   0.786253 0.0606093
0.240993   -0.757153  -0.79544  0.0173647 -0.929971 -0.894012  0.235547 -0.517109   0.182032   -0.90274  -0.852879  0.212104
0.614065    0.030626 -0.286679   0.908969  0.604646  0.720185 -0.244579 -0.178661  -0.950055 -0.0120252  -0.105445 0.0531578
0.533407    0.174958  0.903334   0.156381  0.543367 -0.762282 -0.428918  0.709452  -0.252296  -0.487895   -0.14029  0.264112
-0.194226  -0.0715186  0.602734  -0.633329  0.648061 -0.377232   0.76431 -0.774184     0.2458  -0.563082   -0.52861  0.435124
0.426403   -0.965024 -0.723339  -0.279136 -0.667545 -0.677537  0.634901  0.557254  -0.235998  -0.999848 -0.0394405  0.378694
0.34791    0.921768  0.654681   0.660782  0.598307 0.0473692 -0.838722  0.414236  -0.390647   0.267905   0.231926  0.977179
0.60767   -0.481669  0.405516  -0.158412  -0.48456 0.0750193  0.932425 0.0998887 -0.0409722   0.273858  -0.716461  0.488875
0.297553   -0.252373  -0.02735   0.595922  0.919663  0.714662  0.963793  0.345256   0.538883   0.301958   0.923339  0.162875```
Last edited by b49P23TIvg; July 17th, 2013 at 07:34 AM.
4. #### For completeness...

Using a 4 core laptop computer, pgfortran and gfortran behave similarly with this program.

pgfortran
Code:
```\$ make g
"c:/Program Files/PGI/win64/13.6/bin/pgfortran.exe" -I../include "-Ic:\Program Files (x86)\PGI\win32\13.6\include" -Minform=warn -tp=px-32 -Mbackslash -mp -Lc:\PROGRA~2\PGI\win32\13.6\lib g.f90 -lstd -o g
j example, determinant:     7.000000
j example, permanent:       5.000000
maxima, determinant:       -360.0000
maxima, permanent:          900.0000
serial timing, det:         3.489127
j example, determinant:     7.000000
j example, permanent:       5.000000
maxima, determinant:       -360.0000
maxima, permanent:          900.0000
parallel timing, det:       3.489127
ratio parallel/serial:           51 %
\$```
gfortran
Code:
```\$ gfortran -m32 -Wall -fopenmp g.f90