heat2d.f90
program heat2D
!$use OMP_LIB
implicit none
 
	integer :: niter,i,j,t,rank
	real(8), allocatable, dimension(:,:) :: U0,U1
	real(8) :: alpha,totaltime,Dt,Stab,DtAlpha
	integer, dimension(1:2) :: Nx
	real(8), dimension(1:2) :: Lx,Dx,DxDx,InvDxDx
	character(len=4) :: charbuff
 
	niter = 8000
 
	Nx(:) = 700
	alpha = 0.4d0
	totaltime = 0.005d0
	Dt = totaltime/((niter-1)*1.0d0)
	Lx(:) = 1.0d0
	Dx(:) = Lx(:)/((Nx(:)-1)*1.0d0)
	InvDxDx(:) = 1.0d0/(Dx(:)*Dx(:))
	DxDx(:) = Dx(:)*Dx(:)
	Stab = alpha*Dt*(InvDxDx(1)) + alpha*Dt*(InvDxDx(2))
	DtAlpha = Dt*alpha
	print *, "Stability factor (need < 0.5): ",stab
	if(stab > 0.5d0) stop
	print *,"Calculating, please wait..."
	allocate(U0(0:Nx(1)+1,0:Nx(2)+1),U1(0:Nx(1)+1,0:Nx(2)+1))
 
	U0(:,:) = 1.0d0
	U0(100:150,200:250) = -10.0d0
	U0(300:350,400:450) = +12.0d0
	U1(:,:) = U0(:,:)
 
	rank = 0
	write(charbuff,'(I4.4)') rank
	open(10+rank,file=trim(adjustl('IN'//'.'//trim(adjustl(charbuff))//'.dat')))
	do j=1,Nx(2),10
		do i=1,Nx(1),10
			write(10,*) i,j+(rank*Nx(2)),U0(i,j)
		end do
	end do
	close(10+rank)
 
!$OMP PARALLEL DEFAULT(SHARED) PRIVATE(i,j,t)
	do t=1,niter
		! print *,t
 
!$OMP DO SCHEDULE(RUNTIME)
		do j=1,Nx(2)
			do i=1,Nx(1)
				U1(i,j) = U0(i,j) + DtAlpha* &
				     & ( (U0(i+1,j)-2.0*U0(i,j)+U0(i-1,j))*InvDxDx(1) + &
			         &   (U0(i,j+1)-2.0*U0(i,j)+U0(i,j-1))*InvDxDx(2) )
 
			end do
		end do
!$OMP END DO
 
!$OMP DO SCHEDULE(RUNTIME)
		do j=1,Nx(2)
			do i=1,Nx(1)
				U0(i,j) = U1(i,j) + DtAlpha* &
				     & ( (U1(i+1,j)-2.0*U1(i,j)+U1(i-1,j))*InvDxDx(1) + &
				     &   (U1(i,j+1)-2.0*U1(i,j)+U1(i,j-1))*InvDxDx(2) )
 
			end do
		end do
!$OMP END DO
 
	end do
!$OMP END PARALLEL
 
	rank = 0
	write(charbuff,'(I4.4)') rank
	open(10+rank,file=trim(adjustl('OUT'//'.'//trim(adjustl(charbuff))//'.dat')))
	do j=1,Nx(2),10
		do i=1,Nx(1),10
			write(10,*) i,j+(rank*Nx(2)),U0(i,j)
		end do
	end do
	close(10+rank)
 
	deallocate(U0,U1)
 
end program heat2D

2 versions, both from Adrien Cassagne. One with 1D arrays, second one with 2D arrays.

heat2d1D.c
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
 
int main(int argc, char** argv) 
{
	int niter,i,j,t,rank;
	double alpha,totaltime,Dt,Stab,DtAlpha;
	int Nx[2];
	double Lx[2], Dx[2], DxDx[2], InvDxDx[2];
 
	// parameters
	niter     = 8000;
	Nx[0]     = 700; 
	Nx[1]     = 700;
	alpha     = 0.4;
	totaltime = 0.005;
 
	// initializations computation
	Dt = totaltime/((niter-1)*1.0);
	Lx[0] = 1.0; 
	Lx[1] = 1.0;
 
	Dx[0] = Lx[0]/((Nx[0]-1)*1.0); 
	Dx[1] = Lx[1]/((Nx[1]-1)*1.0);
 
	InvDxDx[0] = 1.0/(Dx[0]*Dx[0]); 
	InvDxDx[1] = 1.0/(Dx[1]*Dx[1]);
 
	DxDx[0] = Dx[0]*Dx[0]; 
	DxDx[1] = Dx[1]*Dx[1];
 
	Stab    = alpha*Dt*(InvDxDx[0]) + alpha*Dt*(InvDxDx[1]);
	DtAlpha = Dt*alpha;
 
	// verify stability condition
	printf("Stability factor (need < 0.5): %lf\n", Stab);
	if(Stab > 0.5) 
		exit(0);
 
	double *U0 = malloc((Nx[0]+2) * (Nx[1]+2) * sizeof(double));
	double *U1 = malloc((Nx[0]+2) * (Nx[1]+2) * sizeof(double));
 
	// init grid
	for(j = 0; j < Nx[1]+2; ++j)
		for(i = 0; i < Nx[0]+2; ++i) 
		{ 
			U0[i+j*Nx[0]] = 1.0;
			U1[i+j*Nx[0]] = 1.0;
			if(i >= 100 && i <= 150 && j>=200 && j<=250) { U0[i+j*Nx[0]] = -10.0; U1[i+j*Nx[0]] = -10.0;}
			if(i >= 300 && i <= 350 && j>=400 && j<=450) { U0[i+j*Nx[0]] =  12.0; U1[i+j*Nx[0]] =  12.0;}
		}
 
	rank = 0;
 
	char fileName[2048];
	sprintf(fileName, "IN.%d.dat", rank);
 
	FILE *file = NULL;
	file = fopen(fileName, "w");
	for(j = 1; j < Nx[0]+1; j += 10)
		for(i = 1; i < Nx[1]+1; i += 10)
			fprintf(file, "%d\t%d\t%lf\n", i, j, U0[i+j*Nx[0]]);
	fclose(file);
 
	printf("Calculating, please wait...\n");
#pragma omp parallel default(shared) private(t,i,j)
	for(t = 0; t < niter; ++t) 
	{
#pragma omp for schedule(runtime)
		for(j = 1; j < Nx[1]+1; ++j)
			for(i = 1; i < Nx[0]+1; ++i) 
				U1[i+j*Nx[0]] = U0[i+j*Nx[0]] + DtAlpha * ((U0[i+1+j*Nx[0]]   -2.0*U0[i+j*Nx[0]] + U0[i-1+j*Nx[0]]  )*InvDxDx[0] + 
				                                           (U0[i+(j+1)*Nx[0]] -2.0*U0[i+j*Nx[0]] + U0[i+(j-1)*Nx[0]])*InvDxDx[1]);
#pragma omp for schedule(runtime)
		for(j = 1; j < Nx[1]+1; ++j)
			for(i = 1; i < Nx[0]+1; ++i) 
				U0[i+j*Nx[0]] = U1[i+j*Nx[0]] + DtAlpha * ((U1[i+1+j*Nx[0]]   -2.0*U1[i+j*Nx[0]] + U1[i-1+j*Nx[0]]  )*InvDxDx[0] + 
				                                           (U1[i+(j+1)*Nx[0]] -2.0*U1[i+j*Nx[0]] + U1[i+(j-1)*Nx[0]])*InvDxDx[1] );
	}
 
	sprintf(fileName, "OUT.%d.dat", rank);
	file = fopen(fileName, "w");
	for(j = 1; j < Nx[1]+1; j += 10)
		for(i = 1; i < Nx[0]+1; i += 10)
			fprintf(file, "%d\t%d\t%lf\n", i, j, U0[i+j*Nx[0]]);
	fclose(file);
 
	free(U0);
	free(U1);
 
	return 0;
}
heat2d2D.c
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
 
int main(int argc, char** argv) 
{
	int niter,i,j,t,rank;
	double alpha,totaltime,Dt,Stab,DtAlpha;
	int Nx[2];
	double Lx[2], Dx[2], DxDx[2], InvDxDx[2];
 
	// parameters
	niter     = 8000;
	Nx[0]     = 700; 
	Nx[1]     = 700;
	alpha     = 0.4;
	totaltime = 0.005;
 
	// initializations computation
	Dt = totaltime/((niter-1)*1.0);
	Lx[0] = 1.0; 
	Lx[1] = 1.0;
 
	Dx[0] = Lx[0]/((Nx[0]-1)*1.0); 
	Dx[1] = Lx[1]/((Nx[1]-1)*1.0);
 
	InvDxDx[0] = 1.0/(Dx[0]*Dx[0]); 
	InvDxDx[1] = 1.0/(Dx[1]*Dx[1]);
 
	DxDx[0] = Dx[0]*Dx[0]; 
	DxDx[1] = Dx[1]*Dx[1];
 
	Stab    = alpha*Dt*(InvDxDx[0]) + alpha*Dt*(InvDxDx[1]);
	DtAlpha = Dt*alpha;
 
	// verify stability condition
	printf("Stability factor (need < 0.5): %lf\n", Stab);
	if(Stab > 0.5) 
		exit(0);
 
	// dynamic allocations
	double **U0 = (double**)malloc((Nx[0]+2) * sizeof(double*));
	for(i = 0; i < Nx[0]+2; ++i)
		U0[i] = (double*)malloc((Nx[1]+2) * sizeof(double));
 
	double **U1 = (double**)malloc((Nx[0]+2) * sizeof(double*));	
	for(i = 0; i < Nx[0]+2; ++i)
		U1[i] = (double*)malloc((Nx[1]+2) * sizeof(double));
 
	// init grid
	for(i = 0; i < Nx[0]+2; ++i)
		for(j = 0; j < Nx[1]+2; ++j) 
		{ 
			U0[i][j] = 1.0;
			U1[i][j] = 1.0;
			if(i >= 100 && i <= 150 && j>=200 && j<=250) { 
				U0[i][j] = -10.0;
				U1[i][j] = -10.0;
			}
			if(i >= 300 && i <= 350 && j>=400 && j<=450) {
				U0[i][j] =  12.0;
				U1[i][j] =  12.0;
			}
		}
 
	rank = 0;
 
	char fileName[2048];
	sprintf(fileName, "IN.%d.dat", rank);
 
	// print inpout
	FILE *file = NULL;
	file = fopen(fileName, "w");
	/*
	for(i = 1; i < Nx[0]+1; i += 10)
		for(j = 1; j < Nx[1]+1; j += 10)
			fprintf(file, "%d\t%d\t%lf\n", i, j, U0[i][j]);
	*/
	for(j = 1; j < Nx[1]+1; j += 10)
		for(i = 1; i < Nx[0]+1; i += 10)
			fprintf(file, "%d\t%d\t%lf\n", i, j, U0[i][j]);
	fclose(file);
 
	printf("Calculating, please wait...\n");
#pragma omp parallel default(shared) private(t,i,j)
	for(t = 0; t < niter; ++t) 
	{
#pragma omp for schedule(runtime)
		for(i = 1; i < Nx[0]+1; ++i)
			for(j = 1; j < Nx[1]+1; ++j) 
				U1[i][j] = U0[i][j] + DtAlpha * ((U0[i+1][j  ] -2.0*U0[i][j] + U0[i-1][j  ])*InvDxDx[0] + 
				                                 (U0[i  ][j+1] -2.0*U0[i][j] + U0[i  ][j-1])*InvDxDx[1]);
#pragma omp for schedule(runtime)
		for(i = 1; i < Nx[0]+1; ++i)
			for(j = 1; j < Nx[1]+1; ++j) 
				U0[i][j] = U1[i][j] + DtAlpha * ((U1[i+1][j  ] -2.0*U1[i][j] + U1[i-1][j  ])*InvDxDx[0] + 
				                                 (U1[i  ][j+1] -2.0*U1[i][j] + U1[i  ][j-1])*InvDxDx[1]);
	}
 
	// print output
	sprintf(fileName, "OUT.%d.dat", rank);
	file = fopen(fileName, "w");
	/*
	for(i = 1; i < Nx[0]+1; i += 10)
		for(j = 1; j < Nx[1]+1; j += 10)
			fprintf(file, "%d\t%d\t%lf\n", i, j, U0[i][j]);
	*/
	for(j = 1; j < Nx[1]+1; j += 10)
		for(i = 1; i < Nx[0]+1; i += 10)
			fprintf(file, "%d\t%d\t%lf\n", i, j, U0[i][j]);
	fclose(file);
 
	// free buffers
	for(i = 0; i < Nx[0]+2; ++i)
		free(U0[i]);
	free(U0);
	for(i = 0; i < Nx[0]+2; ++i)
		free(U1[i]);
	free(U1);
 
	return 0;
}