diff --git a/lab1/C/OpenACC C.ipynb b/lab1/C/OpenACC C.ipynb index 2e96cb0..c105adb 100644 --- a/lab1/C/OpenACC C.ipynb +++ b/lab1/C/OpenACC C.ipynb @@ -20,7 +20,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 1, "metadata": { "collapsed": false, "scrolled": true @@ -30,33 +30,33 @@ "name": "stdout", "output_type": "stream", "text": [ - "Wed Jun 7 13:36:24 2017 \r\n", - "+-----------------------------------------------------------------------------+\r\n", - "| NVIDIA-SMI 375.66 Driver Version: 375.66 |\r\n", - "|-------------------------------+----------------------+----------------------+\r\n", - "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\r\n", - "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\r\n", - "|===============================+======================+======================|\r\n", - "| 0 GeForce GTX 950 Off | 0000:01:00.0 On | N/A |\r\n", - "| 1% 54C P5 11W / 99W | 932MiB / 1996MiB | 0% Default |\r\n", - "+-------------------------------+----------------------+----------------------+\r\n", - " \r\n", - "+-----------------------------------------------------------------------------+\r\n", - "| Processes: GPU Memory |\r\n", - "| GPU PID Type Process name Usage |\r\n", - "|=============================================================================|\r\n", - "| 0 1974 G /usr/lib/xorg/Xorg 624MiB |\r\n", - "| 0 3776 G ...s-passed-by-fd --v8-snapshot-passed-by-fd 32MiB |\r\n", - "| 0 3875 G compiz 106MiB |\r\n", - "| 0 4275 G ...el-token=884290AA53D676228DE3F70F025B1D21 133MiB |\r\n", - "| 0 4324 G ...s-passed-by-fd --v8-snapshot-passed-by-fd 32MiB |\r\n", - "| 0 28457 G /usr/lib/firefox/firefox 1MiB |\r\n", - "+-----------------------------------------------------------------------------+\r\n" + "Tue Jun 20 13:10:41 2017 \n", + "+-----------------------------------------------------------------------------+\n", + "| NVIDIA-SMI 375.66 Driver Version: 375.66 |\n", + "|-------------------------------+----------------------+----------------------+\n", + "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n", + "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n", + "|===============================+======================+======================|\n", + "| 0 GeForce GTX 950 Off | 0000:01:00.0 On | N/A |\n", + "| 23% 59C P0 27W / 99W | 690MiB / 1996MiB | 1% Default |\n", + "+-------------------------------+----------------------+----------------------+\n", + " \n", + "+-----------------------------------------------------------------------------+\n", + "| Processes: GPU Memory |\n", + "| GPU PID Type Process name Usage |\n", + "|=============================================================================|\n", + "| 0 1982 G /usr/lib/xorg/Xorg 357MiB |\n", + "| 0 2997 G compiz 166MiB |\n", + "| 0 3233 G /usr/lib/firefox/firefox 1MiB |\n", + "| 0 3449 G ...s-passed-by-fd --v8-snapshot-passed-by-fd 25MiB |\n", + "| 0 11015 G ...el-token=53D41F0E8A4B8A669C123908959A0849 137MiB |\n", + "+-----------------------------------------------------------------------------+\n" ] } ], "source": [ - "!nvidia-smi" + "%%bash\n", + "nvidia-smi" ] }, { @@ -222,19 +222,20 @@ "name": "stdout", "output_type": "stream", "text": [ - "Compiled Successfully!\r\n" + "Compiled Successfully!\n" ] } ], "source": [ + "%%bash\n", "# To be sure we see some output from the compiler, we'll echo out \"Compiled Successfully!\" \n", "#(if the compile does not return an error)\n", - "!pgcc -fast -o task1_pre_out task1/task1.c && echo \"Compiled Successfully!\"" + "pgcc -fast -o task1_pre_out task1/task1.c && echo \"Compiled Successfully!\"" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": { "collapsed": false, "scrolled": true @@ -255,14 +256,15 @@ " 700, 0.000345\n", " 800, 0.000302\n", " 900, 0.000269\n", - " total: 2.884395 s\n" + " total: 2.815460 s\n" ] } ], "source": [ + "%%bash\n", "# Execute our single-thread CPU-only Jacobi Iteration to get timing information. Make sure you compiled successfully in the \n", "# above command first.\n", - "!./task1_pre_out" + "./task1_pre_out" ] }, { @@ -301,7 +303,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 4, "metadata": { "collapsed": false, "scrolled": true @@ -347,13 +349,14 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ - "!pgprof" + "%%bash\n", + "pgprof" ] }, { @@ -441,49 +444,17 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "GetTimer:\n", - " 3, include \"timer.h\"\n", - " 62, FMA (fused multiply-add) instruction(s) generated\n", - "main:\n", - " 23, Loop not fused: function call before adjacent loop\n", - " Loop not vectorized: may not be beneficial\n", - " Unrolled inner loop 8 times\n", - " Generated 7 prefetches in scalar loop\n", - " 34, Loop not vectorized/parallelized: potential early exits\n", - " 38, Generating implicit copyout(Anew[1:1022][1:1022])\n", - " Generating implicit copyin(A[:][:])\n", - " Generating implicit copyout(A[1:1022][1:1022])\n", - " 41, Loop is parallelizable\n", - " 43, Loop is parallelizable\n", - " Accelerator kernel generated\n", - " Generating Tesla code\n", - " 41, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */\n", - " 43, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */\n", - " 47, Generating implicit reduction(max:error)\n", - " 52, Loop is parallelizable\n", - " 54, Loop is parallelizable\n", - " Accelerator kernel generated\n", - " Generating Tesla code\n", - " 52, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */\n", - " 54, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */\n", - "Compiled Successfully\n" - ] - } - ], + "outputs": [], "source": [ + "%%bash\n", "# Compile the task2.c file with the pgcc compiler\n", "# -acc tells the compiler to process the source recognizing #pragma acc directives\n", "# -Minfo tells the compiler to share information about the compilation process\n", - "!pgcc -acc -Minfo -fast -ta=tesla -o task2_out task2/task2.c && echo \"Compiled Successfully\"" + "pgcc -acc -Minfo -fast -ta=tesla -o task2_out task2/task2.c && echo \"Compiled Successfully\"" ] }, { @@ -541,32 +512,14 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Jacobi relaxation Calculation: 1024 x 1024 mesh\n", - " 0, 0.250000\n", - " 100, 0.002397\n", - " 200, 0.001204\n", - " 300, 0.000804\n", - " 400, 0.000603\n", - " 500, 0.000483\n", - " 600, 0.000403\n", - " 700, 0.000345\n", - " 800, 0.000302\n", - " 900, 0.000269\n", - " total: 3.403485 s\n" - ] - } - ], + "outputs": [], "source": [ - "!./task2_out" + "%%bash\n", + "./task2_out" ] }, { @@ -649,37 +602,14 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "main:\n", - " 34, Generating create(Anew[:][:])\n", - " Generating copy(A[:][:])\n", - " 42, Loop is parallelizable\n", - " 44, Loop is parallelizable\n", - " Accelerator kernel generated\n", - " Generating Tesla code\n", - " 42, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */\n", - " 44, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */\n", - " 48, Generating implicit reduction(max:error)\n", - " 53, Loop is parallelizable\n", - " 55, Loop is parallelizable\n", - " Accelerator kernel generated\n", - " Generating Tesla code\n", - " 53, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */\n", - " 55, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */\n", - "Compiled Successfully\n" - ] - } - ], + "outputs": [], "source": [ - "!pgcc -fast -acc -Minfo=accel -ta=tesla -o task3_out task3/task3.c && echo \"Compiled Successfully\"" + "%%bash\n", + "pgcc -fast -acc -Minfo=accel -ta=tesla -o task3_out task3/task3.c && echo \"Compiled Successfully\"" ] }, { @@ -691,32 +621,14 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Jacobi relaxation Calculation: 1024 x 1024 mesh\n", - " 0, 0.250000\n", - " 100, 0.002397\n", - " 200, 0.001204\n", - " 300, 0.000804\n", - " 400, 0.000603\n", - " 500, 0.000483\n", - " 600, 0.000403\n", - " 700, 0.000345\n", - " 800, 0.000302\n", - " 900, 0.000269\n", - " total: 0.601428 s\n" - ] - } - ], + "outputs": [], "source": [ - "!./task3_out" + "%%bash\n", + "./task3_out" ] }, { @@ -756,79 +668,11 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Jacobi relaxation Calculation: 1024 x 1024 mesh\n", - " 0, 0.250000\n", - " 100, 0.002397\n", - " 200, 0.001204\n", - " 300, 0.000804\n", - " 400, 0.000603\n", - " 500, 0.000483\n", - " 600, 0.000403\n", - " 700, 0.000345\n", - " 800, 0.000302\n", - " 900, 0.000269\n", - " total: 0.581272 s\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "main:\n", - " 34, Generating create(Anew[:][:])\n", - " Generating copy(A[:][:])\n", - " 42, Loop is parallelizable\n", - " 44, Loop is parallelizable\n", - " Accelerator kernel generated\n", - " Generating Tesla code\n", - " 42, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */\n", - " 44, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */\n", - " 48, Generating implicit reduction(max:error)\n", - " 53, Loop is parallelizable\n", - " 55, Loop is parallelizable\n", - " Accelerator kernel generated\n", - " Generating Tesla code\n", - " 53, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */\n", - " 55, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */\n", - "\n", - "Accelerator Kernel Timing data\n", - "/home/fokke/OpenACC/labs/lab1/notebook/C/task3/task3.c\n", - " main NVIDIA devicenum=0\n", - " time(us): 425,590\n", - " 34: data region reached 2 times\n", - " 34: data copyin transfers: 1\n", - " device time(us): total=352 max=352 min=352 avg=352\n", - " 68: data copyout transfers: 1\n", - " device time(us): total=336 max=336 min=336 avg=336\n", - " 37: compute region reached 1000 times\n", - " 37: data copyin transfers: 1000\n", - " device time(us): total=2,452 max=13 min=2 avg=2\n", - " 44: kernel launched 1000 times\n", - " grid: [32x256] block: [32x4]\n", - " device time(us): total=307,190 max=311 min=305 avg=307\n", - " elapsed time(us): total=318,460 max=341 min=316 avg=318\n", - " 44: reduction kernel launched 1000 times\n", - " grid: [1] block: [256]\n", - " device time(us): total=13,053 max=19 min=13 avg=13\n", - " elapsed time(us): total=24,253 max=47 min=23 avg=24\n", - " 44: data copyout transfers: 1000\n", - " device time(us): total=7,380 max=20 min=7 avg=7\n", - " 55: kernel launched 1000 times\n", - " grid: [32x256] block: [32x4]\n", - " device time(us): total=94,827 max=118 min=92 avg=94\n", - " elapsed time(us): total=108,023 max=1,019 min=104 avg=108\n" - ] - } - ], + "outputs": [], "source": [ "%%bash\n", "export PGI_ACC_TIME=1\n", @@ -928,22 +772,24 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true + "collapsed": false }, "outputs": [], "source": [ - "!pgcc -acc -Minfo=accel -fast -ta=tesla -o task4_out task4/task4.c && echo \"Compiled Successfully\"" + "%%bash\n", + "pgcc -acc -Minfo=accel -fast -ta=tesla -o task4_out task4/task4.c && echo \"Compiled Successfully\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true + "collapsed": false }, "outputs": [], "source": [ - "!./task4_out" + "%%bash\n", + "./task4_out" ] }, { @@ -957,7 +803,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true + "collapsed": false }, "outputs": [], "source": [ @@ -985,11 +831,12 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true + "collapsed": false }, "outputs": [], "source": [ - "!pgcc -acc -fast -ta=tesla -Minfo=accel -o task4_out task4/task4.c && echo \"Compiled Successfully\"" + "%%bash\n", + "pgcc -acc -fast -ta=tesla -Minfo=accel -o task4_out task4/task4.c && echo \"Compiled Successfully\"" ] }, { @@ -1003,11 +850,12 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true + "collapsed": false }, "outputs": [], "source": [ - "!./task4_out" + "%%bash\n", + "./task4_out" ] }, { @@ -1038,7 +886,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true + "collapsed": false }, "outputs": [], "source": [ @@ -1059,11 +907,12 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true + "collapsed": false }, "outputs": [], "source": [ - "!OMP_NUM_THREADS=8 ./task4_4096_omp" + "%%bash\n", + "OMP_NUM_THREADS=8 ./task4_4096_omp" ] }, { @@ -1077,22 +926,24 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true + "collapsed": false }, "outputs": [], "source": [ - "!pgcc -acc -fast -ta=tesla -Minfo=accel -o task4_4096_out task4/task4_4096_solution.c && echo \"Compiled Successfully\"" + "%%bash\n", + "pgcc -acc -fast -ta=tesla -Minfo=accel -o task4_4096_out task4/task4_4096_solution.c && echo \"Compiled Successfully\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true + "collapsed": false }, "outputs": [], "source": [ - "!./task4_4096_out" + "%%bash\n", + "./task4_4096_out" ] }, {