[環境]
- Windows 7 Prof. 64bit/8GB
- Core i7-720M + GTX480M(2GB)
- VS2008 Std.
- PGI Visual fortran 2011 11.5 + CUDA3.2 + latest driver
- Option: -fastsse -O3 -Minline -tp=nehalem-64 -ta=nvidia,wait,cuda3.1,cc20,time -Minform=warn -Minfo=accel,ftn
(-ta オプションで cuda3.2 を指定するとなぜか内部エラーになるので、cuda3.1 を指定してバージョンアップ待ち)
お約束で、行列積のテスト問題
!
!  ConsoleApp.f90
!
!  Fortran Console Application
!  Generated by PGI Visual Fortran(R)
!  2011/06/11 8:24:40
!
!  Matrix-product benchmark: computes C = A*B for N x N double-precision
!  matrices twice -- first with plain host loops, then with the same loop
!  nest wrapped in PGI Accelerator directives -- and prints wall-clock time
!  and GFLOPS for each run.
!
!  The source must be preprocessed (the #if lines). When _ACCEL is not
!  defined, the device set-up is skipped and the !$acc lines are ordinary
!  comments, so the second run also executes on the host.
!
program prog
#if defined (_ACCEL)
  use accel_lib
#endif
  implicit none

  ! Variables
  integer, parameter :: N = 1000            ! matrix dimension
  real(8) :: a(N, N), b(N, N)               ! input matrices
  real(8) :: c(N, N)                        ! result matrix
  integer :: i, j, k
  integer :: hz, clock0, clock1             ! system_clock rate and tick stamps
  real    :: walltime, gflops

  ! Body
  ! initialize a, b: both are diagonal matrices
  a(:,:) = 0d0
  b(:,:) = 0d0
  do i = 1, N
    a(i,i) = dble(N-i)
    b(i,i) = dble(i-1)
  enddo

  call system_clock(count_rate=hz)
  print '(/1x,a,e12.6)', "system_clock resolution: ", real(1.d0/hz)

  !--- Host computing
  call system_clock(count=clock0)

  ! calc C = AB
  ! (i is the innermost loop so that c and a are accessed with stride 1)
  do j=1, N
    do i=1, N
      c(i,j) = 0.0d0
    enddo
    do k=1, N
      do i=1, N
        c(i,j) = c(i,j) + a(i,k) * b(k,j)
      enddo
    enddo
  enddo

  call system_clock(count=clock1)
  walltime= real(clock1-clock0)/real(hz)
  ! 2*N**3 floating-point operations (one multiply + one add per inner step)
  gflops = 2*real(N)*real(N)*real(N)/walltime*1.0e-9
  print *,"Matrix size: ", N
  print *,"Host CPU time:",walltime,"sec"
  print *, gflops,"GFLOPS"

  call sleep(5)

  !--- Device computing (PGI Accel. fortran)
#if defined (_ACCEL)
  ! Select and initialize the device before the timed region so that the
  ! start-up cost is measured and reported separately.
  call system_clock(count=clock0)
  call acc_set_device( acc_device_nvidia )
  call acc_init( acc_device_nvidia )
  call system_clock(count=clock1)
  ! Fixed: this line used to read "real*1 / real (hz)", which does not
  ! compile; the elapsed tick count clock1-clock0 is the intended argument.
  walltime= real(clock1-clock0)/real(hz)
  print '(1x,a,F8.4,a)', 'initialize nvidia, initilize time = ',walltime," second"
#endif

  call system_clock(count=clock0)

  ! calc C = AB
!$acc region
!$acc do parallel,vector(16)
  do j=1, N
    do i=1, N
      c(i,j) = 0.0d0
    enddo
    do k=1, N
!$acc do parallel,vector(16)
      do i=1, N
        c(i,j) = c(i,j) + a(i,k) * b(k,j)
      enddo
    enddo
  enddo
!$acc end region

  call system_clock(count=clock1)
  walltime= real(clock1-clock0)/real(hz)
  gflops = 2*real(N)*real(N)*real(N)/walltime*1.0e-9
  print *,"Matrix size : ", N, " X ",N
  ! NOTE: the label below still says "Host CPU time" although this is the
  ! timing of the directive-annotated loop nest; the text is left unchanged
  ! so the output format stays the same.
  print *,"Host CPU time:",walltime,"sec"
  print *, gflops,"GFLOPS"

  ! print for check
  !do j=1, N
  !  write(*,'(10F4.1)') (c(i,j),i=1,N)
  !enddo

end program prog
[結果]
system_clock resolution: 0.100000E-05
Matrix size: 1000
Host CPU time: 0.9070000 sec
2.205071 GFLOPS
initialize nvidia, initilize time = 0.1680 second
Matrix size : 1000 X 1000
Host CPU time: 6.4000003E-02 sec
31.25000 GFLOPS
(倍精度演算で14倍くらい。こんなもん?)
*1: clock1-clock0(コード中の初期化時間の計算 `real(clock1-clock0)/real(hz)` における `real(...)` の引数。ブログの脚注記法として誤って本文から切り出されたもの)