いわて駐在研究日誌2。

NEVER STAND BEHIND ME

PGI accelerated fortran (1)

[環境]

  • Windows 7 Prof. 64bit/8GB
  • Core i7-720M + GTX480M(2GB)
  • VS2008 Std.
  • PGI Visual fortran 2011 11.5 + CUDA3.2 + latest driver
  • Option: -fastsse -O3 -Minline -tp=nehalem-64 -ta=nvidia,wait,cuda3.1,cc20,time -Minform=warn -Minfo=accel,ftn

(-ta オプションで cuda3.2 を指定するとなぜか内部エラーになるため、cuda3.1 を指定している。バージョンアップ待ち)


お約束で、行列積のテスト問題

!
!  ConsoleApp.f90
!
!  Fortran Console Application 
!  Generated by PGI Visual Fortran(R)
!  2011/06/11 8:24:40
!

      program prog
      !
      ! Matrix-product benchmark: computes C = A*B for N x N double
      ! precision matrices twice and reports wall time and GFLOPS for
      ! each pass.
      !
      !   1. Plain host loops.
      !   2. The same loop nest enclosed in a PGI Accelerator
      !      "!$acc region".  The "!$acc" lines are ordinary comments to
      !      a compiler that does not process the directives, in which
      !      case this pass also runs on the host.
      !
      ! The accelerator runtime calls (and "use accel_lib") are compiled
      ! only when the preprocessor symbol _ACCEL is defined.
      !
      ! A and B are diagonal (a(i,i) = N-i, b(i,i) = i-1), so the exact
      ! product is diagonal with c(i,i) = (N-i)*(i-1); the commented-out
      ! print at the end can be used to inspect the result.

#if defined (_ACCEL)
      use accel_lib
#endif

      implicit none

      ! Variables

      integer, parameter :: N = 1000      ! matrix dimension
      real(8) :: a(N, N), b(N, N)         ! input matrices
      real(8) :: c(N, N)                  ! result, overwritten by each pass
      integer :: i,j,k

      integer :: hz, clock0, clock1       ! system_clock rate and tick counts
      real    :: walltime, gflops

      ! Body

      ! initialize a, b
      a(:,:) = 0d0
      b(:,:) = 0d0
      do i = 1, N
        a(i,i) = dble(N-i)
        b(i,i) = dble(i-1)
      enddo

      call system_clock(count_rate=hz)
      print '(/1x,a,e12.6)', "system_clock resolution: ", real(1.d0/hz)

  !--- Host computing

      call system_clock(count=clock0)

      ! calc C = AB
      ! i is the innermost index so that c(:,j) and a(:,k) are traversed
      ! along the first (contiguous) array dimension.

      do j=1, N

        do i=1, N
          c(i,j) = 0.0d0
        enddo

        do k=1, N
          do i=1, N
            c(i,j) = c(i,j) + a(i,k) * b(k,j)
          enddo
        enddo

      enddo

      call system_clock(count=clock1)

      ! elapsed seconds = tick difference / ticks per second
      walltime = real(clock1-clock0)/real(hz)
      ! 2*N**3 floating-point operations (one multiply + one add per term)
      gflops = 2*real(N)*real(N)*real(N)/walltime*1.0e-9

      print *,"Matrix size: ", N
      print *,"Host CPU time:",walltime,"sec"
      print *, gflops,"GFLOPS"

      ! Pause before the second pass (sleep is a non-standard extension).
      call sleep(5)

  !--- Device computing (PGI Accel. fortran)

#if defined (_ACCEL)

      ! Time the explicit device selection/initialisation separately so
      ! that it is not included in the compute timing below.
      call system_clock(count=clock0)
      call acc_set_device( acc_device_nvidia )
      call acc_init( acc_device_nvidia )

      call system_clock(count=clock1)
      ! Fixed: this line previously read "real*1 / real (hz)", which is
      ! not valid Fortran; the intended argument is the tick difference.
      walltime = real(clock1-clock0) / real(hz)
      print '(1x,a,F8.4,a)', 'initialize nvidia, initilize time = ',walltime," second"

#endif

      call system_clock(count=clock0)

      ! calc C = AB
      ! Same loop nest as the host pass.  The timed interval spans the
      ! whole "!$acc region" ... "!$acc end region" block.

!$acc region
!$acc do parallel,vector(16)
      do j=1, N
        do i=1, N
          c(i,j) = 0.0d0
        enddo
        do k=1, N
!$acc do parallel,vector(16)
          do i=1, N
            c(i,j) = c(i,j) + a(i,k) * b(k,j)
          enddo
        enddo
      enddo
!$acc end region

      call system_clock(count=clock1)

      walltime = real(clock1-clock0)/real(hz)
      gflops = 2*real(N)*real(N)*real(N)/walltime*1.0e-9

      ! NOTE(review): the label below still says "Host CPU time" although
      ! this pass is the accelerated one when built with the directives
      ! enabled; the text is left unchanged so the output format stays the
      ! same as before.
      print *,"Matrix size : ", N, " X ",N
      print *,"Host CPU time:",walltime,"sec"
      print *, gflops,"GFLOPS"

      ! print for check
      !do j=1, N
      !  write(*,'(10F4.1)') (c(i,j),i=1,N)
      !enddo

      end program prog

[結果]
system_clock resolution: 0.100000E-05

Matrix size: 1000
Host CPU time: 0.9070000 sec
2.205071 GFLOPS

initialize nvidia, initilize time = 0.1680 second
Matrix size : 1000 X 1000
Host CPU time: 6.4000003E-02 sec
31.25000 GFLOPS
(倍精度演算で14倍くらい。こんなもん?)

*1:clock1-clock0