Cleaned up notebook

Removed the leading `!` shell escape and added the `%%bash` cell magic to all cells that
should be interpreted by the command-line shell. This makes it easier to copy and paste the relevant lines into a terminal.
This commit is contained in:
F. Dijkstra 2017-06-20 13:11:32 +02:00
parent cce7f726c0
commit 53ea1d338c
1 changed files with 76 additions and 225 deletions

View File

@ -20,7 +20,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 1,
"metadata": {
"collapsed": false,
"scrolled": true
@ -30,33 +30,33 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Wed Jun 7 13:36:24 2017 \r\n",
"+-----------------------------------------------------------------------------+\r\n",
"| NVIDIA-SMI 375.66 Driver Version: 375.66 |\r\n",
"|-------------------------------+----------------------+----------------------+\r\n",
"| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\r\n",
"| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\r\n",
"|===============================+======================+======================|\r\n",
"| 0 GeForce GTX 950 Off | 0000:01:00.0 On | N/A |\r\n",
"| 1% 54C P5 11W / 99W | 932MiB / 1996MiB | 0% Default |\r\n",
"+-------------------------------+----------------------+----------------------+\r\n",
" \r\n",
"+-----------------------------------------------------------------------------+\r\n",
"| Processes: GPU Memory |\r\n",
"| GPU PID Type Process name Usage |\r\n",
"|=============================================================================|\r\n",
"| 0 1974 G /usr/lib/xorg/Xorg 624MiB |\r\n",
"| 0 3776 G ...s-passed-by-fd --v8-snapshot-passed-by-fd 32MiB |\r\n",
"| 0 3875 G compiz 106MiB |\r\n",
"| 0 4275 G ...el-token=884290AA53D676228DE3F70F025B1D21 133MiB |\r\n",
"| 0 4324 G ...s-passed-by-fd --v8-snapshot-passed-by-fd 32MiB |\r\n",
"| 0 28457 G /usr/lib/firefox/firefox 1MiB |\r\n",
"+-----------------------------------------------------------------------------+\r\n"
"Tue Jun 20 13:10:41 2017 \n",
"+-----------------------------------------------------------------------------+\n",
"| NVIDIA-SMI 375.66 Driver Version: 375.66 |\n",
"|-------------------------------+----------------------+----------------------+\n",
"| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n",
"| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n",
"|===============================+======================+======================|\n",
"| 0 GeForce GTX 950 Off | 0000:01:00.0 On | N/A |\n",
"| 23% 59C P0 27W / 99W | 690MiB / 1996MiB | 1% Default |\n",
"+-------------------------------+----------------------+----------------------+\n",
" \n",
"+-----------------------------------------------------------------------------+\n",
"| Processes: GPU Memory |\n",
"| GPU PID Type Process name Usage |\n",
"|=============================================================================|\n",
"| 0 1982 G /usr/lib/xorg/Xorg 357MiB |\n",
"| 0 2997 G compiz 166MiB |\n",
"| 0 3233 G /usr/lib/firefox/firefox 1MiB |\n",
"| 0 3449 G ...s-passed-by-fd --v8-snapshot-passed-by-fd 25MiB |\n",
"| 0 11015 G ...el-token=53D41F0E8A4B8A669C123908959A0849 137MiB |\n",
"+-----------------------------------------------------------------------------+\n"
]
}
],
"source": [
"!nvidia-smi"
"%%bash\n",
"nvidia-smi"
]
},
{
@ -222,19 +222,20 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Compiled Successfully!\r\n"
"Compiled Successfully!\n"
]
}
],
"source": [
"%%bash\n",
"# To be sure we see some output from the compiler, we'll echo out \"Compiled Successfully!\" \n",
"#(if the compile does not return an error)\n",
"!pgcc -fast -o task1_pre_out task1/task1.c && echo \"Compiled Successfully!\""
"pgcc -fast -o task1_pre_out task1/task1.c && echo \"Compiled Successfully!\""
]
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 3,
"metadata": {
"collapsed": false,
"scrolled": true
@ -255,14 +256,15 @@
" 700, 0.000345\n",
" 800, 0.000302\n",
" 900, 0.000269\n",
" total: 2.884395 s\n"
" total: 2.815460 s\n"
]
}
],
"source": [
"%%bash\n",
"# Execute our single-thread CPU-only Jacobi Iteration to get timing information. Make sure you compiled successfully in the \n",
"# above command first.\n",
"!./task1_pre_out"
"./task1_pre_out"
]
},
{
@ -301,7 +303,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 4,
"metadata": {
"collapsed": false,
"scrolled": true
@ -347,13 +349,14 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"!pgprof"
"%%bash\n",
"pgprof"
]
},
{
@ -441,49 +444,17 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"GetTimer:\n",
" 3, include \"timer.h\"\n",
" 62, FMA (fused multiply-add) instruction(s) generated\n",
"main:\n",
" 23, Loop not fused: function call before adjacent loop\n",
" Loop not vectorized: may not be beneficial\n",
" Unrolled inner loop 8 times\n",
" Generated 7 prefetches in scalar loop\n",
" 34, Loop not vectorized/parallelized: potential early exits\n",
" 38, Generating implicit copyout(Anew[1:1022][1:1022])\n",
" Generating implicit copyin(A[:][:])\n",
" Generating implicit copyout(A[1:1022][1:1022])\n",
" 41, Loop is parallelizable\n",
" 43, Loop is parallelizable\n",
" Accelerator kernel generated\n",
" Generating Tesla code\n",
" 41, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */\n",
" 43, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */\n",
" 47, Generating implicit reduction(max:error)\n",
" 52, Loop is parallelizable\n",
" 54, Loop is parallelizable\n",
" Accelerator kernel generated\n",
" Generating Tesla code\n",
" 52, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */\n",
" 54, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */\n",
"Compiled Successfully\n"
]
}
],
"outputs": [],
"source": [
"%%bash\n",
"# Compile the task2.c file with the pgcc compiler\n",
"# -acc tells the compiler to process the source recognizing #pragma acc directives\n",
"# -Minfo tells the compiler to share information about the compilation process\n",
"!pgcc -acc -Minfo -fast -ta=tesla -o task2_out task2/task2.c && echo \"Compiled Successfully\""
"pgcc -acc -Minfo -fast -ta=tesla -o task2_out task2/task2.c && echo \"Compiled Successfully\""
]
},
{
@ -541,32 +512,14 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Jacobi relaxation Calculation: 1024 x 1024 mesh\n",
" 0, 0.250000\n",
" 100, 0.002397\n",
" 200, 0.001204\n",
" 300, 0.000804\n",
" 400, 0.000603\n",
" 500, 0.000483\n",
" 600, 0.000403\n",
" 700, 0.000345\n",
" 800, 0.000302\n",
" 900, 0.000269\n",
" total: 3.403485 s\n"
]
}
],
"outputs": [],
"source": [
"!./task2_out"
"%%bash\n",
"./task2_out"
]
},
{
@ -649,37 +602,14 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"main:\n",
" 34, Generating create(Anew[:][:])\n",
" Generating copy(A[:][:])\n",
" 42, Loop is parallelizable\n",
" 44, Loop is parallelizable\n",
" Accelerator kernel generated\n",
" Generating Tesla code\n",
" 42, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */\n",
" 44, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */\n",
" 48, Generating implicit reduction(max:error)\n",
" 53, Loop is parallelizable\n",
" 55, Loop is parallelizable\n",
" Accelerator kernel generated\n",
" Generating Tesla code\n",
" 53, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */\n",
" 55, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */\n",
"Compiled Successfully\n"
]
}
],
"outputs": [],
"source": [
"!pgcc -fast -acc -Minfo=accel -ta=tesla -o task3_out task3/task3.c && echo \"Compiled Successfully\""
"%%bash\n",
"pgcc -fast -acc -Minfo=accel -ta=tesla -o task3_out task3/task3.c && echo \"Compiled Successfully\""
]
},
{
@ -691,32 +621,14 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Jacobi relaxation Calculation: 1024 x 1024 mesh\n",
" 0, 0.250000\n",
" 100, 0.002397\n",
" 200, 0.001204\n",
" 300, 0.000804\n",
" 400, 0.000603\n",
" 500, 0.000483\n",
" 600, 0.000403\n",
" 700, 0.000345\n",
" 800, 0.000302\n",
" 900, 0.000269\n",
" total: 0.601428 s\n"
]
}
],
"outputs": [],
"source": [
"!./task3_out"
"%%bash\n",
"./task3_out"
]
},
{
@ -756,79 +668,11 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Jacobi relaxation Calculation: 1024 x 1024 mesh\n",
" 0, 0.250000\n",
" 100, 0.002397\n",
" 200, 0.001204\n",
" 300, 0.000804\n",
" 400, 0.000603\n",
" 500, 0.000483\n",
" 600, 0.000403\n",
" 700, 0.000345\n",
" 800, 0.000302\n",
" 900, 0.000269\n",
" total: 0.581272 s\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"main:\n",
" 34, Generating create(Anew[:][:])\n",
" Generating copy(A[:][:])\n",
" 42, Loop is parallelizable\n",
" 44, Loop is parallelizable\n",
" Accelerator kernel generated\n",
" Generating Tesla code\n",
" 42, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */\n",
" 44, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */\n",
" 48, Generating implicit reduction(max:error)\n",
" 53, Loop is parallelizable\n",
" 55, Loop is parallelizable\n",
" Accelerator kernel generated\n",
" Generating Tesla code\n",
" 53, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */\n",
" 55, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */\n",
"\n",
"Accelerator Kernel Timing data\n",
"/home/fokke/OpenACC/labs/lab1/notebook/C/task3/task3.c\n",
" main NVIDIA devicenum=0\n",
" time(us): 425,590\n",
" 34: data region reached 2 times\n",
" 34: data copyin transfers: 1\n",
" device time(us): total=352 max=352 min=352 avg=352\n",
" 68: data copyout transfers: 1\n",
" device time(us): total=336 max=336 min=336 avg=336\n",
" 37: compute region reached 1000 times\n",
" 37: data copyin transfers: 1000\n",
" device time(us): total=2,452 max=13 min=2 avg=2\n",
" 44: kernel launched 1000 times\n",
" grid: [32x256] block: [32x4]\n",
" device time(us): total=307,190 max=311 min=305 avg=307\n",
" elapsed time(us): total=318,460 max=341 min=316 avg=318\n",
" 44: reduction kernel launched 1000 times\n",
" grid: [1] block: [256]\n",
" device time(us): total=13,053 max=19 min=13 avg=13\n",
" elapsed time(us): total=24,253 max=47 min=23 avg=24\n",
" 44: data copyout transfers: 1000\n",
" device time(us): total=7,380 max=20 min=7 avg=7\n",
" 55: kernel launched 1000 times\n",
" grid: [32x256] block: [32x4]\n",
" device time(us): total=94,827 max=118 min=92 avg=94\n",
" elapsed time(us): total=108,023 max=1,019 min=104 avg=108\n"
]
}
],
"outputs": [],
"source": [
"%%bash\n",
"export PGI_ACC_TIME=1\n",
@ -928,22 +772,24 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
"collapsed": false
},
"outputs": [],
"source": [
"!pgcc -acc -Minfo=accel -fast -ta=tesla -o task4_out task4/task4.c && echo \"Compiled Successfully\""
"%%bash\n",
"pgcc -acc -Minfo=accel -fast -ta=tesla -o task4_out task4/task4.c && echo \"Compiled Successfully\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
"collapsed": false
},
"outputs": [],
"source": [
"!./task4_out"
"%%bash\n",
"./task4_out"
]
},
{
@ -957,7 +803,7 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
"collapsed": false
},
"outputs": [],
"source": [
@ -985,11 +831,12 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
"collapsed": false
},
"outputs": [],
"source": [
"!pgcc -acc -fast -ta=tesla -Minfo=accel -o task4_out task4/task4.c && echo \"Compiled Successfully\""
"%%bash\n",
"pgcc -acc -fast -ta=tesla -Minfo=accel -o task4_out task4/task4.c && echo \"Compiled Successfully\""
]
},
{
@ -1003,11 +850,12 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
"collapsed": false
},
"outputs": [],
"source": [
"!./task4_out"
"%%bash\n",
"./task4_out"
]
},
{
@ -1038,7 +886,7 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
"collapsed": false
},
"outputs": [],
"source": [
@ -1059,11 +907,12 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
"collapsed": false
},
"outputs": [],
"source": [
"!OMP_NUM_THREADS=8 ./task4_4096_omp"
"%%bash\n",
"OMP_NUM_THREADS=8 ./task4_4096_omp"
]
},
{
@ -1077,22 +926,24 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
"collapsed": false
},
"outputs": [],
"source": [
"!pgcc -acc -fast -ta=tesla -Minfo=accel -o task4_4096_out task4/task4_4096_solution.c && echo \"Compiled Successfully\""
"%%bash\n",
"pgcc -acc -fast -ta=tesla -Minfo=accel -o task4_4096_out task4/task4_4096_solution.c && echo \"Compiled Successfully\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
"collapsed": false
},
"outputs": [],
"source": [
"!./task4_4096_out"
"%%bash\n",
"./task4_4096_out"
]
},
{