From 00f202bddb06486f7e72325eb8a1b0d626d7d46f Mon Sep 17 00:00:00 2001 From: codetector Date: Tue, 31 Mar 2020 15:18:11 -0400 Subject: [PATCH] two more benchmarks --- benchmarks/new_opencl/convolution/Makefile | 44 ++ benchmarks/new_opencl/convolution/input.bmp | Bin 0 -> 44922 bytes benchmarks/new_opencl/convolution/kernel.cl | 54 ++ benchmarks/new_opencl/convolution/main.cpp | 261 +++++++ benchmarks/new_opencl/convolution/utils.cpp | 180 +++++ benchmarks/new_opencl/convolution/utils.h | 11 + benchmarks/new_opencl/transpose/.gitignore | 1 + benchmarks/new_opencl/transpose/Makefile | 44 ++ benchmarks/new_opencl/transpose/main.cc | 387 +++++++++++ benchmarks/new_opencl/transpose/oclUtils.h | 198 ++++++ benchmarks/new_opencl/transpose/shrQATest.h | 238 +++++++ benchmarks/new_opencl/transpose/shrUtils.h | 642 ++++++++++++++++++ benchmarks/new_opencl/transpose/transpose.cl | 108 +++ .../new_opencl/transpose/transpose_gold.cpp | 38 ++ 14 files changed, 2206 insertions(+) create mode 100644 benchmarks/new_opencl/convolution/Makefile create mode 100644 benchmarks/new_opencl/convolution/input.bmp create mode 100755 benchmarks/new_opencl/convolution/kernel.cl create mode 100755 benchmarks/new_opencl/convolution/main.cpp create mode 100644 benchmarks/new_opencl/convolution/utils.cpp create mode 100644 benchmarks/new_opencl/convolution/utils.h create mode 100644 benchmarks/new_opencl/transpose/.gitignore create mode 100644 benchmarks/new_opencl/transpose/Makefile create mode 100644 benchmarks/new_opencl/transpose/main.cc create mode 100644 benchmarks/new_opencl/transpose/oclUtils.h create mode 100644 benchmarks/new_opencl/transpose/shrQATest.h create mode 100644 benchmarks/new_opencl/transpose/shrUtils.h create mode 100644 benchmarks/new_opencl/transpose/transpose.cl create mode 100644 benchmarks/new_opencl/transpose/transpose_gold.cpp diff --git a/benchmarks/new_opencl/convolution/Makefile b/benchmarks/new_opencl/convolution/Makefile new file mode 100644 index 00000000..512dfd0e --- /dev/null +++ b/benchmarks/new_opencl/convolution/Makefile @@ -0,0 +1,44 @@ +RISCV_TOOL_PATH ?= $(wildcard ~/dev/riscv-gnu-toolchain/drops) +POCLCC_PATH ?= $(wildcard ~/dev/pocl/drops_vortex_cc) +POCLRT_PATH ?= $(wildcard ..) +DRIVER_PATH ?= $(wildcard ../../../driver/sw) + +CXXFLAGS += -std=c++11 -O0 -g -fpermissive -Wall -Wextra -pedantic -Wfatal-errors + +CXXFLAGS += -I$(POCLRT_PATH)/include + +LDFLAGS += -L$(POCLRT_PATH)/lib -L$(DRIVER_PATH)/simx -lOpenCL -lvortex + +PROJECT = convolution + +SRCS = main.cpp utils.cpp + +all: $(PROJECT) + +kernel.pocl: kernel.cl + POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCLCC_PATH)/lib:$(DRIVER_PATH)/simx $(POCLCC_PATH)/bin/poclcc -o kernel.pocl kernel.cl + +$(PROJECT): $(SRCS) + $(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@ + +run-fpga: $(PROJECT) kernel.pocl + LD_LIBRARY_PATH=$(POCLRT_PATH)/lib:$(DRIVER_PATH)/opae:$(LD_LIBRARY_PATH) ./$(PROJECT) + +run-ase: $(PROJECT) kernel.pocl + LD_LIBRARY_PATH=$(POCLRT_PATH)/lib:$(DRIVER_PATH)/opae/ase:$(LD_LIBRARY_PATH) ./$(PROJECT) + +run-simx: $(PROJECT) kernel.pocl + LD_LIBRARY_PATH=$(POCLRT_PATH)/lib:$(DRIVER_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) + +run-rtlsim: $(PROJECT) kernel.pocl + LD_LIBRARY_PATH=$(POCLRT_PATH)/lib:$(DRIVER_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) + +.depend: $(SRCS) + $(CXX) $(CXXFLAGS) -MM $^ > .depend; + +clean: + rm -rf $(PROJECT) *.o *.dump .depend + +ifneq ($(MAKECMDGOALS),clean) + -include .depend +endif \ No newline at end of file diff --git a/benchmarks/new_opencl/convolution/input.bmp b/benchmarks/new_opencl/convolution/input.bmp new file mode 100644 index 0000000000000000000000000000000000000000..e1b6c6faf5dc7ac7b3d8a76b9a8177726f5a6bdb GIT binary patch literal 44922 zcmb5X2Vh&*mFN8=Q4&?DE>Y|i0MQFkiQWj107(!8Nw9ZHqNpy*a!cZpIB{aR#C8(L z&SZA_?9TSyd+)vX-Pzvz?(FZ}_aG^m>^JjWN+1Z3c=vz$IrrTAQl-bgpsDe=Nz?Y^ z*#S-aIrXhsy)HleV!!@h-jn~SeC3}0=CA(o;E(v#0|yRh2M-?94jnqAH8nM9hYugt zjvP6n9X)zfYi@4VjvYIu9Y20tJ8|NKcJky&?bNAL+Ue7$wU(9^?aY}o+S#*bwR7jr zY3I+M*DhSRpk2IpQM+{Ml6Lv>W$nt9D_U!7t9JG3RjsY9O>1v&*E%{nw9d{>t*fg` z>+bH>dU|@a-rioVudh$*@9)zpZ`eJKxbh{q$4qyWjn;_St8jX`g@ox%R#9eNX$bANw)w```b* z_TxYPwmrW8-BxY(0=1@{Egag`c1z{`^~@kH*3G;xBM3E zxBk}Os{OX#_S>}I{@Z`M_B(#Z@6dkd@BE$G@A_T8OZ(lw`*&-<=lA>`?f3rP->dz; z-}n2p-~ao6zxD_Iz#q{5;2->h+8_Eue@OeofA|k;f8>w+5$%uu(LbvFu|M|5v_JmG z|G4%i{=}cq{^XzhliHv9Q-4bP(|`I;Yk%g?{2A@f{@FjP{kcE)=d?fn=l{I+7yiOu z(Ej3I{EOOO`b&RF`^$g%FKd70ulyD5um07)s{OUU_Sdw({@4Gy_BZ~<-_ZW%-~5}} z-}+mBOZ(e@`)_N1=kNR-?eG5GzpMSdzxVgFzyJ6DzV;9P!9URc;XnL`+CTb7|494C z|M(wk|Ky+i6YZb=(|@Y{vw!x_w1583|GD;yzxa#VzxWsbLi?Bh@?UEI>RD|3UkY|M5R+|LH&dC+$D~=l`tzm;dr# zwEy~F|Eu=j{@Z`k{`-Ib@7n+PAOA!9pa1iJYX9qh{V(l*|L^~;{h$Byf3#ovrC-vH zw_fgQIeYckk1z}ROBe?+H_wfuS5e%qZvOXn_V@Po_jY!6^bGWw1_n)oCbP+c`kF1a zA**%B<{9FN-8wYnaPya^G$c8Ar(t3KC&-Rt&xU5+8M%jBQRb%4FQolouS?(OaF>FVt1@9XIquvi8MEklEYCSVS^90NAHtvr00Ry#kNAjyyw zQdrD`{ZQEi{(j6RI6>RnJJ8wQ+tc0K*KzquM^E1Xuk?1d0w4TcJv|Vh8-76{tZzr( zpqW1(gpB?DeZ9|t`xT$+a&X|#mF|v?PCgQvLTM~`TSu2n!!%^Fnk`myPmj&Tq-_IE ztJ&%v7;;#lhsohUAZ%c>+pT7s!|if=9G;-t2O+2X2b~%=gQ1zh{t{^zaX1=xM#u+SLnwMn~%<*xv?K zWQIxy(14Ei?g5L!Z&EryMDeQ|I`7r*9=rt0p=eJZpADtEc&1G)zZCx|Lw)c_*!E2Br}f8du696X2ci%n6+uYaHii_^2i` zI~`u9&4UVHQ=oy>I?&f|k*W8ibNz$;=DywmIDyjl_qMgR^}q=z0RAhjtyeoR3JhHa zm~ux4S}L|wWI#mV2eJJCP~(T^?7#dn+^VA^5}M>@bDY@U>?v3rwxj^5qvwA4|`#; zS_jR@0sol2-C`aZ!hT~1OauKW!eF;pv%dbmw$}C@?BhV+AolM{>(#c-9$rPwbqNqB z(8u34B!J)iQHX!wH8AV{^jF0hHn(8>!8_ggZT)7 z&F-?>@NX8I!)bQZwiXtqW9A{Opzy(F#SEhRRx9QaDmYLZo83y3i?PEW@*$W)Y$?)U zwOQc4jlZ(MKujP;uphej^$&Jgm;p)%{$BJ?oLdJ`H`c$GDZ+o90%8RFpe4byA^~Rb z2eAIg9~%4@<}>(q=wcW4roSI9169O`OIr|rp7$erNY+5VMZEYB6L)!$W``YBc8pBU z-q>Bfy|Way+pr2Is|83FyEisooy`qmCgHx*fxmF@*LJIjg5B0{f>aR3ZYHL%yAcPj z(~s^UXm|s@4-6uJ@E`qe?GUpduXF-miN9C_NI(}&4mC^fIt6L;a9lz2iOk* zj$IVBLeUhZxd66b=zu6Nv7VkmQA`VBi$((+*~STYOQrQ|TMsriAKu=mV=+s{neg?NL4?HSa$0!-6|&ha7L&z9JY}&0(PEZsqG?_I_yF^u zh4^dG)ZKa&s#w5p>gy!qY8Ua>&7l$ivD*XqS)qVP5k~MiZ2ym^;MGMy7XY3U0-d?s zMr6^~*LL~*rEW1zd??b1cf+}hZIU3+Y#nkt%tJPZ$AY%_<`?$w-F$d`|Ni~OiSAwu zx!Gp1r>a{Un=d@RaeaTfuk##9t!PvjG|>W9CBW^lmNL1Iel&>-=$FwBJBn;?!D0S5N37flm`k0k_m zU&qyM>=1MTe_I>D4(u1vln+G$u-d(7Q-c5^0ndQ_YMzPplMwiMdbIR-0-@lr4|DAs z8Hq-MKBooKgbl-3xrc+1Xe5C9>bKUmZ|&c@_uz#)JL@S-QrFchtpla`>-)Fw+<)Qj zy<3H0!qEY-X^1J|DnU(W2l0Am7uL3yKwsqDFkWT?AtCV40O3G?FS0RYb~u3FMtB2$ zv$?N}YC}hNAD^exR45=(K(AyVLIK?h{{O?TU>gr22kProw>KCL`h6aUYj`9Y9SQke z1VJXuIPsq^7>UJWp<(A>UsrPV*1g+z?(A*u-3<13cU(Go^!UKs^5*`XyAK{cxW8F+ zW2R9XtZBCdDw1M#b`of+*tM%$f;eTxka;n&Fpuvf0=?ZB-@zffr5F4?2%b1%#edQg zl`AMaB?5rYw%|>~tm*_9YHUBo>cH{NV5K)v9Z|t1FA!uU@zI^|W6&)^xnLG_`#F=B)=WJh-(IA!EBh z+30fX<@0CGoIOKT>Fk+v7q7H;bRq-Yh(jCl(0Uo84t)r7dwW#;kNpQPS_0;)?LC8t z1+)P_f7sK9%qR)KQY#bKZfKUkw90S~6j!et0_{o!alnHcfA%zf?tiBhF9S()8)WmCfAScD21vY^1mc!Qa&@q`@5En}h&FK?B5KjHeqQ9e)LIgoQ zQ@<@gsGqZoZ#T|&m65tQGE&c88iPF>*b+Wm|gsBhw&G0{# zDNoPOFD|a6Ps@k#iO*d@zs^BOa!`mNZyszqeDuVrmX^~ej~#6~bl_O)fXhSVESPMj zp0?Hw!H=^5KUGFif0O}psPKyo93;l1W@zYu&8h&(h1F2pS2GZ(az*_e^}0OBOX{Ee zx_YLa=)&wrLIJ9iD=NH@Dsl}J;9M!A_acWP=BktRIXGj)3tgzLKfF?;b_yz zfn=s!sm<5t=VQ(4V#Z~kUP3MdcpbhxZ#vR!;742z9B(y|Fst~Ps!9(QZ$P;UtGS<0 zzq4ENB)${{;J@G}`2s%#6A_@G+5jq-m*>xhJ|Ttk`PX&2DDa9L$2&ctkw_%y_jm&d zh2QJIX~7k$7@=q)1^yuTulh?9Q`MQ7`trEPimSTV+|**uTYZ68o)bB1Ofg|%pdrbIh2@?NWHJmEXk)3 z0QeHLD9m!X|JP`|)A+9DD6=!Bs5N)ELy?h4FyQe9lc`iZ7WCPN1V2>_Zzz^Xr<16J zt-m!?o}8GdOxG92-S7wbKYYfV$&OCJ|N4CRh>l+W*yrFk$e@20{3lN;`9IQh@W9D7 z3+M$unU%Sxy{!vBfB_Kv_>f)+B2Wjh0mS5F0{vzu(gFp<29Ta9{wpm1FZ@5^b*5rS ziV)>!n+yE7J&!k>1Z+Iya}uV4-{ta!!{w>T$@1iMwdi+P`#UcjJ9>UF zoGwn!&erB;1JC#`#8t!iF7n^d|3illH=j5S{u9T+fAD1ckcX;22EgUCo4VWEyU7;? zKk_d=Vo>l)s6?o%SG+9D3krw?kXj4>4T}E=oImtZ1GFxs0$qvCk*ki-2yUMeMmWV1 zg2F#!LoQwJ;YcE#$)=(quYI5`I61|N!Nl~;SWx(L@%YgzreLZxQ>)F+R)Q_MM9D|W zjO0NdBL5BXKXCBS;iJb-o<4o*M6>Y!bcYrA5CAwL|LxsW)`*P+Kk%(GF~qL>j$ zX0o|-43QY@h*c`($;pXIb;54}|E1&2t(I_RVs@rhuUAJdA7UQzc>-LYl#W`gzrla( z-ysy>*ol)k!^7Zj>9SHX^HD+YxP<>565fH|X6aY;Kmz8c$)2(RgEl2E4FWjra>0gh z$<-NZ8UjzIV!SQS<(;RrGM@12kyb|}O4t+d4v%EAnRFuTqw;4btM_>P(Ns2=KdbZB`BE_=Sj9lsv{wQS@?nkpfX~goelw| z;}?L>hD^lxVr5XDRpVV=*N6PEhWv-YkB#$zKbuZQhNYP6g)81b4EBScn9$Ujo~cYt zPvZc`T^5_U>(Z%n-DXc_67{duYpI@Q!LMJ__;djeVl|8_>I67)^a$SX(1C+zd+dUL zSVY=B*xk|DN2Nf@1Qycbp58&y1s%VnItTz&8Ri8AFapYHIKiaDX@F~dmsb^Rb!B=ui_2v`xUL64~qY5BtH-I*Q%Y z)!sEgshCos4gQlYfS+8qk?J4-gBB?P)`iHsQJl+ zw5BVz5IGJ>-tb5^m(9X|H<^zYu^bL1vMi!zc++P!m*;0@rmN+t*`gFRx?2UmJ5jF9 z*5?*#IqMl6zf44Cqb|UMP=FybR6k(>z%Rlt`1@Qe^!f!q{0Beb9r{l#VxWf#mDEx2 z@gf350TKoeIftQ_Apv3oK_(yj6k`MWbKhk)&yA;3Qxnn-9KLLeL=%GF7X@uDH9|e# z<{b9>1%GaIRPcxVc6VuhzEZ8#X6D9S;E;Uh@DBEX^Z{Zf12N?xDKK1jer)s+ei1vARQ1pmNSnU08nB7l`*Gkj45 zP;OIQOkManh(G-FdFtrwQAE&L)pW@kj>V%9KlpQkKcetc)*r_G?Fp?TA6bj%Ui3}%86Jyg;71F)DhXPG+M<=DI)Z|2UetuzoaXQki z9DqIXL8Slau@fgw9wYoYbl~tMlULzK0Nl1d@Uw6%HFM-&>a&y{h5wT7 zOIAQ#z`^ee1(XPg1Q>V}>2-_>q5`N8LUSCT0<_ZPy z2LwNIIjry(1%EUeE>^L9)6=sH^W`Ag*xS)FV1qQP2)v7>bbO#Y1lIh|>00$m6to{D@^Rni-{O zCKHdwVyP1DuQD?`H$OQUbJ#4s?cEl~@OU9RQD0cB&(*TF)5ID2XUk<|aAo1X6h#>F zg0TN2@h|v~2!6|OC>SF52EVnpvuBV6H2BZ5BPCC2kSv}FeqIs*pelh_LIA%IK#v0i zmOc!+H-40<$ZXV%geLM--jt{6pL(htICNoXguE{ia{Ho6+hak&A7EOcXr?$eUIc$4 zPN2K6MEXBFJ3W)KTM1UXP1evf?w~$9Lj@#jKi8x_SHFUQmH#qu`8xaqBL2ruQvW!4 zycq*{wAC^Ud?@Xw=+@WSW3qd|Z>Oh%qBrUj?xyfp9Ea93LOc#zzvV+|*2EVR^nX zHB*~UTP@)48nBF1=j(Ixv-OpYm7MqLA)UbLk~Qx?xL$0D|A6An4eB#N&@3iz0wX z149w#;CahPGLuRKoSrDsKAIT`u@>MFzC_~LVrhIVlZYkL`I)&X-2e0}!A{an0jRyR z&z7H=pPwWDUzncE2RjezpUTG?pD$b&?ko8hdBFFR{+>B|1_c2Bu{Nup#x|<2RG)@= zyLv57FS5sSo0V7}@t0+0J=Lce3ju7Ds!5k=#V}(pq@OuLkg~RdOnBspdl^?4u*QTcD>dUJOa|P#+x#w!vpgC7t zsMiKjmllP!H~pP9pg2 zM*fNQX{RB}k+o$6K!w3}N|jQpL-J~lE{yjO4z?xx>%ZDSzMT&nxz47anw09(Aj3P+X|BV zuC6RCEw4_8x{n$kXnZ(V*3kSVNaYpFB?*iHHFmKJZJmT1mk3_;s!dYz0qecf<5>aRpg%gM({&ETZkCPZo)aI8K=VzCf7bnwh8>`k%uQxk2Ge0}OxU{mq zR*m$*e|4SbW&rn}6Mw2-M`_r)aPfT0X{rAl@304H>I{bh!C|+B9vrfJT-&>8aB-~;i7p?g9YeR7I2jTx%%X9RyH>9;ur ze=!#i3jc#JW+YW88Tj+5;zVU`er{%JYJ5B&rb1xz28NTPganh5mD<8WHQG-G$VHTl zsjGG%x?psJrO=Pq2laQ5_xqlXWk=yXte20tNy+tS@jN8K=eWiAi+O=en-l>93R z5FbdhfuG+O4GewpO3)gH&%7lr?H zbqM}wx=@-ZBmeR2==fx1dTy>(p#)l<$go&MxtzR#`bc4Ha%OgRGTwJYK3q6pTuHt` z13dwdi5$fIooqRG@#2MZr;i)>DLqFpX{@lYwoTSe+Zd@mi_oY)wLU@|`mI!{3khI7 zLhu_VK;mXWsjwOkY6j{>H5nnqb5qiPo0+Id%AfEI{Hbgv>9g3tUmD9%0CYKhA+8il z6-yHnx_2v{{GnUwrDWI{dXy_SJ+Dw9ok&GvYqR6_p4G%o7+(?x3D=zlU_tkChiGFP1} zWQwy(Q~odwVBX~PWGNRP$xkh;uT4dIjvB;Pm*YA*_ZtQP3P|*O`W%ad7tXeb{GaZ2 zhiT>(iT1lp^b}CtkY#u;-8QtKI;93D0-$REL7>790dxrv1pqsjQ~!Sy8DtU-?EI3N zpuv6Po2Kw*vk8ycmMAFxhkb6>u++q38HGQSNRE;AOwUXebE#ZuZh3Vknu!d%d;f{@b@VEBjFG=r@`)Ci%XW_ z8~BNZX)9FaZlwU^Mcz;p{v$BT2uc*7Knn$)4FyyE$TMc4=A?*VkfJe1P>Qk- z_(lGO|I+Uz_~V)41pLn=GNtLc+IS{gn5fj3SJpS?^QC0aJCd%itSl~7C#NSX3k&&y z<0=6VX6SMz{8#B$!w4Ue;=ka(c>c_ZqfNA4c*v}xFq9Sc0oJzN;CDKyJd^Fwa}57g zsf#>U)_cGomKAzYfQAKp2K$GeDJl?ftJ;eYK~3t}?z6H}988KS?5 z>g>Ym>Qa5azP!G@vADc8lN&A453sSaT(2)JuCLT4yyp)I(dF|SG*H%GIbn*ug8$gb zGv_Z}M*dq)fd5>-m&{Ta8ur=gxw3lwxL)@#{3q6Ry1jamt6~7jgIxYd3E&;$nqCj0Ep8;1t8^5BR>qf+c70)X zu2x%G-CEn)*q$xdDzmHGTPurmGc)r`>$SiP%AMiz`^p5=dY<)fbHih5grx?Nt96hHh$J2cv_$f+6AeWc`r2!HG2naQF$of+u zzxpR@Bf=79B(qXaLvND6TN|SE|(g%!2dG*mr@W-t#p21?4~M{#cU>*FW~n{ z{wlNerPYm%t=*l~1;XByjjhd<#rn+j+(Kn6bVVgVOhK+>3<0kPVbI_Z$^Y4l^tB59 z=BB0#;E#bw$4_gM&rj$H*WGknVg7XdxLUPBEbRy*f?p9pDL^owgV(_}ow&h4aX?G zBLE?Z{dF~Uz%B1n6IJrz^#?$*SB`}Hr6*c*Vhn! zc)v*gGdmjXJtlNj7uT=CHB|H`tQSRSBKbdmxs@g$@E>hzx?~Q<8Q=+dt-XC_hcBSx zchM~C@UR3SDUPc2sw~JIq(Mh1fT#d@0Gq>%VZb$j>$8v_bm#(KenTPGUlg056Q#L( zm{p$iu*DIF|6{pigasf1;HcoQRET&+%QZs0#g)ayh1yhMbY^W6_U~-6-)nw)s;csz zjm2`-+H%Oaq7GkpuZzF#garTT^Ovu-w_Uk#<|OzpTLeG+2mcWGUBiCD|6%lB;fMdK z%q7c%vav?m5#s0ze;^VdQSgtDf4R2ezC2+D;N($3Rb)^!cm1l0#Ov^ zvXXx^C?GL_xc>_@gtfOS{D)hI6#ru*lpHL*WdGoIu<`w+9slKwlSRK!&njTGzbtFh2x317!0w2 zdJt^j2agQF_5Af!{3jupnwS2Lj$Ee4GkNJqyPGU7?j3N%its-t?QY;F`_C07E0u}S z-1tP57-xRDR?cP9scd;}X=(e$t=+wAyBnpre>gr$ZD4V7JZqB@xO}MkV3E59df~pL zKPOu*wRLuNwO>AW3j9}X;RJvX=b+bw`}YKf34a7XMSnN!ml~H514tQQctqGQDxgfD z1OW>6Q|N-;xVnOA5JazZD&i;_=sKZj#jiD8b)~ZTxJU3$O8hh8XXlI9|6Fl$dU_(C zDU_$G<=jNAM13%q7#p9QsqgIFy?uLc@A_n#h?FwP^ukP~98_z7`XqGzi|jSA)~kmA z^uC?B)YjG0-G2E3@n3VhJp%p&+zh(e3qwUgc2KeKpzyQz`%8|Jdqy z^~yQopW~f^UkCvI2YUzYzJOHTg#Q#lJ>XZBaakRgDu@gBPcx>N00|l)04_jbXgu&; zo{H8p^obbk7Xs+?khc{*)GNGp=&~c3%f)?WXS@h^vm*bjv#RRn6bt^jR2DZt^h5K{ zRJAfw%@%5FH|{-p`HNqE{iXZcGleXLp}Fe#Xeq8rph9=!v!AyB%!vH=#2EsBt5+_b z1^f21%h{I67}%Q?cpiOES;ekx=-a}(5iR`%~b zdhM;Z-~Q^CUwbe&RW41&>rzf5ZYR{7r|OkDqFx{C`U3cdFYB zLzCjr1I_{5zduCgDfo#3G5-O*#;F_t_}!im{ke(&Dh84uIHa`LfDaL*zwGF-k0dj$c@-_%4@J@aZo&eyqWN zH4(!W2;K%}aRNF?4jec?6whTMjy^}Mfd1!_u^?I)3ODeh|Kq~{TD86~H#;{wyLj`Z zH^2JTZ#?1LfcM{h=ZzQk5dQk|()3sm{5p6=gogY#?7y_W96f&G~M9NSAF#v);OnpaJFx{qMV`;Icze@T` z1iN5Vs1;Xq6QRrjtPwF_96W1@U%n_1Vd>P**c05UMV6G4;`g{_6@LvZ+>o zWiJ5pJJat=BvWG1qkdc8fK~Qku$!3mPp5Ch?#|<}y_oXH3zo;J1T+ay*xc3;)9-=s(S_)1_=WJ6ae= z6J}Wcs8r{+Zoc%Tx4-t}{cn8by^lV8_sQ43@}FKof%wd^@VFiT$4f7`k z0Q}7?ezW=E6xDz55BO<35I`ftwtjK{5%wX$_F?dg90-0r1QPq_9+q`_MF8ahqz)E_ zqlb>tQAG2RfKl+clBm3pK;hFL)l>a}cO(Qd#=#cTNG=_A3^<~rQ{blq+T|GL!=w3$ z>P&S!myyEf%-l3#{#<=&b8GAVo9_Vqo%cTZ=z|a6d-C2k6#kULZ>6&znG1oR%}9j%;MXgB$^)on0KbkuiT*43 z7ybjW>1cCvGuG}P7ZT~GyApylAEv)vO$4hbyw(LmFbid#8wXDfMDpMtw2#35G2y>k z@P}iglPq|H9}6%wLk_fDpQGZp`r!4qpM3MfC*OGgqfb8i=9Bk6c>9YFwiagD8Zs7Y zKhkudsmU-hTvl;H5kQ`U|1_(Am(HJM`Sb9h^CrK+&=H@tANP+xA08I`u-i)mf~xQ- z1E~0~@GAij{1^cE&!-*%zBJSdYL2?$l$(>MXt6ndRI)x9x&shtkl5Floc@-2JlPk< z{yD)v3jVybf;jz{7x+J;<4>eX0qe^P)cIDHZoKr?yYIjEwf8>xf&Vx?7DIsRQ}Qn! zP(?tp3KEDW)1m-+45)ITkgywOAH(J!$L)dpbj#_MmNV#BGa_NI`cNY-XKKdeo~g%y zV?9CeyUg|o_{ne+VUH^qi-n{4NgY42e}R2I3v-KGtE<-@zVhbRzVYP!?|k>O&p!U- z;}70{@{O;5>A`AkVRm*R(R=zpGukf{;Gd3P) zQ-41H^z+ZZ_ucP)^1<6*xV5yjFgu-dwzM>~p9`t=_h<4i_*>4i{CEDGIDn?hLm@U? zAkgu!%QP_LmfjXWeJ9Aj$D`v{5s-o37m6eP@Lybj7(l_#rxR2_1L>%448z_kCcyIN z2%scWpoy$Wo={Y`ipP9 z_pR@K{=Fan{&zq9?EByU?DOw_=UeZ6<AXZQCdkF({{ww(>{NpQ}P~xJ)7Xr|hEBus(V(hNq zlq)N|I&+^ve+rga!FxRhP|qpLU2!JUBiC;a?`~0&{zYG44-+T2&y*^tTx9T-;K3E1z+!Fih_WzKi zzbDxdYT!S7xYZig0mXyhw@Ggg_;CWPu6uER$UpW^6@djm;UD}L3n&o~Ez81x03Q^e z$J)CLk!@Y+kO5wj9~IzdZyUeFTl%}q@RB2ujrpvWV0Nlf9?hg8!@g)bF{1Dz|L{MS zpQ*2`t}d@{?A(3nosU2L{IkzK{p4HU{`jMhKmO#~-}&UjCtu#H*Xs*qN4uEqzB zCYg!|qx={52Y!~nE~)m1ro*kaNG2_K;sM9Npbhs2e&TM~jO-KsD+8bcAQZymlg&u- z2O)sDYm&haq7kaaoCyeic~oEegCbG`KkuHm#j|n0%@W8~rpu$* z^vLjVG$Z&W=N->y($Q#UVqs}vWov6|XaALNeEjWCzx~NaA97yUhaY_O;fL=(dH=(2 zytFjCJUsxkm*pfw$9utLW#ih7LnU-2LPZ#r_-K9WvnAVAD-A2ive6)uSQ3w|$Q zpsa9;|C0nr?7wWr(&YvC2!Is9fu@y=r5e-QIIR1^P}Kz*gv7p zITD~`z&bdbof7=)02q#ClY)N^`R9drJYA{J%`R?iZ0?zhr=7{|%|vi6E+gGVo9Bun?x}z?JJ) zPj&G8ostP5LdzidgVw$V{t>3s;qF|$9>1X0Ko6ecn|>V~JC7>-1^*>hza{<^{4u4^si@D^ zZ*c}9h&$|8_FuU_(SO8Whfn&G%A_+GKotS%4lpPpuHt@ys}HUuPXxHa`;77WGZB6H zGoAVJ?DRl5n+n?cJQ?J_kjq2^!4!Rv@PB@Osz6gXy8uhGbMSv@|N3k1y!+vkx4-bx zqlfqJ-n;wY;lmeSdgaY`-uc=q`_-jdwpE${51lroPPo$;s`s1lSMZ-XdF;rc<}P=f z)C!S~daZ*)ZaIZO)StcBKB8X{|3>^L6-Me?mY>0u=s+{X_m)|5lq3#axKPFajWUEsMjK&u^mUAkY~kISQPbfkggl z2qBcV%bRL$iU4YEbU+XQ3;wC;3E_V*k}qasi7_&M!9SYGWyfacX#QVZyZ_dE?|${w zNB8gEx_R@)wQDzS-nx6|{!3qY82+pEO#(v7KjQwpeypW37Z5;|K*atDTlsjU?FzL# z(Dfoz=NaFXn2WG0vIu%(C_PbAJl-A1Wh1tM;T-%2e=HQqvksUTpQ#)C&*n1(__LL{ z`rcRHd;9grcW>?O$@=H!=FW|4H*dZ8_~kFX^2&0#>OXm~%R&BRs1K{5Gn%}Pbh{) zOrRJ9o~bNAAvOd6I`EE6O*j5LKi1{X=Oea3e-8c^+3yyPjMCT#|EX$?<_lE7G80k@ zARpR$`|U5j`uOIxoz30#jkTTijn&QV8xQZj_|mJ7HmfC9^W`(b{pO>_HDqF9pJaLx z{!92L-Ct~fIN9e*WR(Dr{tVdtY%=ue_yg3wb?Yw$P?3KfzaIW5_Uj2iMA<(+Tfd0J zK9cn6*HgmBE+ZL0Ro~Tf`E|t_gFy$IJBD+)h|Ltti~NH>6fKr=3E}_jL}9czno4Dh zY<-<5SEg^i{`%t=U)-VZY2(_B-CH-e=mFWgv-je?$G2CeOU|BC>?mzHM|nfIA@h<4 zH5C8Bf0_1gy8jNH>Kmr-43?%sF4LeRzz$>|UAII53crZIVg83D`=|O3eo+@A0FK4b z|K~(ZF38Xc4;4hcZKhhOL!P ze}*6TFU`M~E??B~xAgnzJA(j8(m#_kNb#HFcBH6J$&2tu**_KiO8Do|@r(V#{80i? z@*h%obowf6z|}7%~dLsV^drAU%tD2e_?U1wy?ReG*zi@@7~4_?(N@Lnji1Gbn%R8f{=?fK3>0w z+WG|k%a_1U_v@jS0Y81Wa5PE!XLbi!{GjVvihATcBL7d_AK@PbF*YOf%Lst%R#N!c zmn!hkXD%VQ8~+SfAX2XrK#X7m{?lnao#5eCPi{2qbVQ3&RVDu;Z1&5=Q^Nm=Q8q=; z0h`Tp_Qk?{ZgOLH@8R~^c71uVTB^*h@7=k7@4;=3GQPRLGCuBZZC2!$nfx#Tg#Rl1 zzex8l@_%L^kQDJxCrJM+?oi0*_Da_^MG#hgi3SY$XZ=guzb6_}%kFGG>~O`#4E)j5L^+TBXJh+Bp)gJdcq*SQ zO%^LNGc_z8%?~$k?Cx&NOt0?jKe~Va=Jxfyd-w0IO^@{-Yj9t|mznARxTua_bw8gz zPWR)P!65uc0pdZs$>I$MIZjvM7x^aG{O%=pE+2Aw5`w=-i%TpsSVcg{YMY4uf6vAs}FA8zI*rPz5BQJXVTd&xti#_h@D<$kjbbaTfc<=YV{NO zKW7SZxCRmc{z0pcGY05yrn#B?k1Cj;m-@ei|0?{`-YX=QO0 zc1Z`wha?G7oEL3Z(B(lV&(oP5y5bT1zT`9fQxk%}!T)@8Bs)qgd^wY=PtVl1w$@g6 zAHVTtZR+tGFYauuK#k40$%$Gv-+STg*)ylu->dew$_4muT<37JO8+GMm-O$@Iddot z`?Hx?0Q|!oJ0YtsBLBoeO8Z#}Q@flD{a5L~5I_|GL;%o#9hFXDc@(w_-_@YV)W9zc zk)hx#kNivknp%eo`Ji(+t;;{@Zf*+s7yDl;7mD=6#>mw2bib4q>az=r>({Sczw?FH z?^ZMQomXyd?Owlq_x9G>;!Le-zjR6*FlzvqJGIIAnVd=WQ>A|wNPob8f%2!a|1m$u zMEiqYr(ITE;XnAFiSz z`LAZBsMOpYDd&T(V45mF_=&Cy72W>LgTF|JPb!v7q*)n97$$4=rP-;u{oVP!{f*s~ zsl_jTasTel%0jg=IhtuZc<{uT6Osc+EC41j4o(i~DUi~-t?+r?{to%O#5b$vT;{O0EY*ZFUuB*EM9ktLwegPZNAPMAA z$^u6iIwg+x$0qV2cZfzO{9h&!PmNV3^6~ii3|$=s*#Vr4#F8m0!^OGz#cFk`UafD{ zC-1!W`pxRp?stFY^*#Dv$2bL~(0}&g38}Q;l=y6px?xG;1Z(jqB3oaHf6mJ0$5XQT z4g3-KPjzv`XED2iG~Frw6ZWtt{+aog>|eqkR{xd!qX1I>13rf~fS;>@68Ip0hMgCc zQ0UTNfLD+OwL)%CfafR8DfX{=BB|UM^{+IneulqPC=&l9N8}(dRsmMFSF6+2#X0uH zmG{2-(Tnqo@BZWmdu!D>*2lJ&mXfEQ+8JRutKFwhos`}FVv=O>U;TXisNlb18xiwI zb=5ayb_e{NHY?lPqzb0gUnTz{|El{1|EIDgfiL)l07m|6$hiStmvjYHh%P2T7klGT zy~6|VG)!}gH9M6HdL!vvQSc|&?*ji5gGNd(y<;fGQRW%=JZgc4P zQRQ95ln0O^BJ1OV$(G9Unu zM(4kfXCsl;n zNco3;Gmnkk?ZSQnzmSUlB>6L{Uq<{Z%SF%RpF^7@{#Ez{o4^;Nrj%N*0Ro@bVTbWc ziUi8?iT!Jg^RCg!3^iZY`KL>HTuV;yC#3#4T`G=`jV7aU@KY2iE-%ppw)4sxuPh{% z-}=^PtN}f#ukAc|d}Dc@lOZzMs9I@P8mX&4>#>Z}XU?2s-zP`U33>F(t!Zgn9(Wn{&+H5u9S09|C+6o*a416B%_#SIzKt%##XD<@)%;YU2ZnG7MExYz{?g^ES2@R)v;5@9X*uq{ub0u& z+1A=RFbsYM-5+dq@;EH404n|~>^MK2{{mmd|7!V5cmE>&97@5)IdOHm1PF`8)pWOQmg8;d1#Y;u+Qm%?9y|0xaf1mRKlaZE{tWR~?8gIy)bgL9{=)z0NH~D}8~SGOSy{m! zAW4HDh=R_1F%4*^?wvS|;6h(;tegtsjnX-G`++|{F{SgrRw<2_#zzyWv0^lstW_sR zcfatZxggGgePO;_i3X)c-+==u` z|E`>6L1Vv-!;np!Z#NX6{6qaW&e4PH`?Q(`zuwj^Dj))&t3URSozU37XXa0NfACBF zOAft+|UXdrii=n(vrzj1#-QXCFlCI?3TrT(u-ZDnz5b8FUn=5XhQ&BwIBzxvak-@ftEb?ezrEP(2)rrYpTi!3^GFA-rirwXyPULC~G*;N5hv3=vgwTV<^Yi)P;_WhTC z`qPKEZp~lYsn>7by1u)!c4K$Fo)b3>XTe6iN~p>PXJ$$mC*bAe|Gqv9V5c>}dKUOc z5RU-+pEyk~MD<&(00CcOe--@e3gq&H=>8$~74R&4$)P!Xr1*c`1nLql1`S7x4LTzN zj(l^Bte7FOLL4l{fkNs$38juoBROv-kRBV2fIk&W6i4H1`yZW{68;ybt5YQoz{sH& zvkMC=+t;pdZtm{i*t`8=eg4jk{kyMz@7;&@9xpCE+P`^&%@8|lYumf^B8SKnM)SEG zwSAg9aZ;4VaE){&(FG_>H5ckR% zARGZ8-LE=!tb@YNrYNU!`as9!5N{B0A>tfC%_b8uz9yubGqgBHMpTHQ>=Cwz&2-F0 zPTKG5?e1|#N3-EjJfDgt3;B3BPQ&v=c{C9(p#NnKfXJZ`bBl}hmHOfayMC^3tlyih zJgBei-hS=T>dLFN>cgF-8+$i)HdmJFizUtw0euEZl?YXed7|}f-%=-8$z{22UssM7`DxIBRfd>8B@O59dPOsQQ;{6}#)x;^85Ih^W` zR?ES8a(ayHpeBG&L(GU6V2c?6?B#U!_N!N<7vb!M!B8$03?=f3k=ST9#xi@Bjqrt} z;(wVfaaqB?K#Sk>mpQ$jge#+5H6eH8#*KsPqQ%AR+*{eX}G874-OZD2Bx21yASir*X)t4tzEp`N%N}RMI0Mx%rVJr~3jN@yTw#WF(r)#-o(iMzS3HMHXD;$XobN`_J4$ zec|qYeRpSTV`q2w=JoZJv75^)ODnf_D{Kb4acyU3^X8*F%X<%Mu>?*1G7{3-FMdiY zm6F#J){97pFgS-;@Wh}cV{Eo{dIQ{C$U6K~{NXdkk^@iV1T-B!yT4Ikx!VxCA{j^) z2b2X&1R5L|B@&_TSb&KU^%Bq=zu(@*UR#!1&ya58?H*DV8p4xeMYioUFd3$%ev~YKMrM9+ycd>ST zeSK?vYwzyO)# z2>3#}d*qH917=r9x__fIJGwxS$5M1VMTLbdv6C)?bMPxbv6c5)mFR`wC*o(TVY7 zIGoO>I1FqgFj5%LX9Yh;UKcaeL*^HjxAv|tudK2qio+gC6Q$ZZN8!!Pt=5YZW2F)& z16AmcV`KF6#N^ZzCk0lk)#(a9uaER_d^6U z`nsny0Hefgd>3)|u^oh@ZMn@2Rs<6w<#jnpL>Z@Oj7=zU_V-MTrz4Rh z?QeN}AoxeKp5rc{Oh`-s5q&F$rSb$YTeIX5#|&W`13Gn2JSu~eI#U)|f= z+uz>2&4C#6^BfAxZlPU;e*%c?9 z(c(raa&!9?Y@C=&;1m5S=PEHZ1l6-Ysb;D6FOhQ`S_?tYbFqNpwk#>~2b^`xo?|i3 ztRo^w7-#uJ9Nr|$JmAk|^EtMp=Eg_a)jEwNgFg%Y`OUSB&2^UjuPx84RFhL2hBKbc z=1YZ%+;l#_xUjgjz4zeuwbku)I-!x930>+505NNDSOOODMiO~+Vx+pPes(`4I9r?S zkHpC1XW?5apTUtN8%~M^jv}KJ1L>#?6V8zTD%LY$wf|A6e~@B^%5p^3kBe2K!a|%+ zm@kQqSP#}xSyksYCP_+1@n}4uDoE5&#UiV11URf%lCAHh?8r!}IGPB5Ji2CBv_UiR(yPI2U3sw50Nt8J#EG#GEu>o*u zd}L~Ai*}$bdSZ%5sS15L*x3mCiTa7Pxi^m7R*ED2DTUDQD*=7Phxrevj&Ff4nO~g! zwph*ad^8x48xj#4m^lxCmS9PvFq8zsz?WD!C>bsH48^QT$pJy<&>S%xL|;)Q>ENVZ zhO7&tDD6W4GBN^lkv1lX_u;6U5#c`}KM7DF&l$LxOn$sDT5jN18{FqMmRGLtZm!K% zaF~=a~!ps*}U)`9OYL7Iix!G1zi%CX0jnH0jAtOix4 zD(HW9baHl{HLvCMrIi&r0hp5SPlYul^!If+>93<=;6Mzxnu zZlsdod}UHYoFP>oIbRCJ<1;vGB}B(FtXKHMoI8O2$d)98L>xwcFSiB3c#}d}xK*Le z6%qo7`3s`sBzKr}v>aqm0I6p{0YwBvn;~i=?*;>sQc40LAq7L8$UUxNxiJ!*VkVU= zjLC`LqobG~IsHTM7jnfZnm?y&3rovv0GyhguTd7xkEX(Irou9e1nk83f%4+&(){8a zx5u-~0Ul0{aG+|Ql$BHNol=>e;ha29(N$hChha1DquG&R>HUE%?EVYU=S8fPU{fd` zg2&)s=za-qxs{rkJH@sSx;O?-;Dbl<0tt%=A&l5Tcbv-BH%dr^5ORiw$~*v02|yAA zUQlTgt#+JXz=FEmb=dr)Cr@vI0Qv6r=XQyj(bMv+8OszII!zx))`c|y&5z4nv zl=Df*)yC?ag*;u)k*jfhNGdzZ`Dq-Qs!oWZeSU47v9`Pj{U%sb7?HRmr0%;lLd%=r z2LtvWgDiCeAB959CP%Gy^?_d`fO98%iB}c>h1UUIBiqA+AOeyRV9q4b5{IhrN~S0! zNBL_#J>qrABKT!xCJ+@=1@ni^CeTS0#!27V@X4xgLHI8_LI{Ltgk}dk0S)kHXRGCG zlyY>`?+&Cgai2=U3WCUFBfdrDdGv*3RxVe8cq{H?E5TU0bftf+tRPG}bXuL1M zvYot7{*-%XQGHgm01*=v`6%%Kvp5Ec0%XL&U&7we=Pmd-@qDz%vd36i#D7Wx0Zuq5 z73qGPng)MuwpvPu0*QQP#2?BPQhpBWK+;DN93xY%R>oz8cx81R+}Ez}?cKa}>$Z$r zxAt%DUB9+1__3bESe!5c|2b=lu0L`7lJ;`yuo!u;N|DB{cs|5kA0#dk1t)y=ax{f5 z|KO*Vp#MWua4JI}mZRhQsBKDdQ(U3A1c`Eq_{bd?cmm1EWJoxPro~zL1%DPVQ6Ox= z;El<_&)}cp{4>r!s&Mkn)YL?2QutqEQ(!9SPZV+^K7Wc6!vS98jFcKN;_3=rz#G^1 zZtmZ@edq4od-w0(yLb2A-8X z3C<#unG>=}B+0pi7&Kk|{c_(-Sybui>F(+7?(Sho9YlN~z%dL^RJhGwOS26{IhC3! z@ajQ=0V@8Z+5@EpS}c1Q*J( zyJ#3WR`M_Bx?s+g*Fq^}bx&lJIp@S(qK@2yA&<{g(yDSlL;R7C3zFb)qd0|Jq96Pq zfCM&!09B$M5q9V&=yOJ07NsYLmLN#7Ar5*SL+&Ns&X19@q;qt=mP&Ly3H}0YG32t7 zlGsm>*3OXHpav7^pnHfzlV)!`0QifKUy||orN@sR3w(Hw)YB(-jfxRHvJ0d{bWTW0 zluNUk>cj5}{xCAi{f9UR0;LqhGNfoQPFz(k5m`}05cuGinyN^v?#~ewzh8>mM7+uZ z5kM2l8j2$D)lkKFRgLFI2&;0ckuZnOQkEu2<5;aA2YteSq>_a`~3$ab2_v@*K;Do zh7g=}E4H{fiDQ{fEE0`#{1lv~UIOHavXl=nypvQ8CQ4M_nULZ?>k1x${yTO5zKv{;|#%wTk9 zYC_Dbl)*&dMA_wZ^^BZ1k5YkJFjqvjgu0X+D&UtKNbw)|qPN0+R72EXKl}^)62lCO zGbiAQNPUQ)U(`o%3#+-Bij*X_)I$P2Bas@RWFvSA3kR9z#q6Q?87v*5Y_9lFGNV@%($HmZDu zN}`M$sfmu#gxTVDfup{t-TxKrz2VTGt75`HX z#IR6B(=jhBP&!y#kWr^>QUkvdHVB}zpV<@|>My=PSAUUXSzCbr68h<}9P%%gQSuuJ zAqYx2Hl2__v05Ojdx^vH-d-GLFJFvqv!A|}e3mIPK~6Qr;xI)}tI{3#6`SCP7(KvG zSEr{bgKp0yOr00ccX-RoTbo$8B|#y`m8lS$1x1N3B=IEg!Jz`19Dw3jeQ|MVd5Ja} zeejgOt&7CW!hdu?@!wEB5zMFj7yJ$D2S~YTDK1{Pk2D0NSyYdlz|Bp3;jy4p1HAH7 z5bL<*JNzFQbOe*K!-*az3=wB}jf?HY48e5C!V!OpnIe!J{e&iq{&N$|rNzmJx%1Mw ztIq69WA?nO!f578wHN3@PwX1$JVnzvZWOY-N)o^>O2)dMM8XA27LU_ba3 z`&HfaDgQ+}g49T^xi;&9^)IgRpM+-$$2FrMDwNKgMvxh zcOs9JD*OoF1e3u2i~eJeu>~ZEVhaU7v>1<=TU$I1esn=3XnuZP*pKwf<>b=h{QhrSXTp|cwT0ooP-YBjA>@!l zcFNS80~~-*;)sBt$RMDAq-fU9KllBt{eIGQoh^P5lRRs9*09&RcZ>dRrHrBfG(@F8 zxD9fP)c{|5H2`AeD2{8j%GI410ez|BhJQ1ox9B7E{y zEB{jeFGo2|j7O^a`uh9&`}+q5%!QQ!B2WM)umJCobhM6bUD6DSnxdeWRO4&>>&rgk z%s(-~!8e?EhUZVSf5X2F0n&z?p5W{c!W1XtOmPx$js$$dx$hHquX1C_HzQ{mD){jh zbw1%C2Z3%-?otTsQ2@PrN8RJIAv`O%_g4HnoSNC$|M~{-S1FiK|MC+0ujq$)`LCq| zYVs%hnft~4gWt#|$J5pL<;an*jvQsEI6{a71`xvFAO^s^$Iy3Y&tIakeB1W^6Y6m) zG|ntW{QTY8D%xI{HNw!9qWdL&A>Y#xIP}f`x&Lq){5G_w#zz^AfB;F1sDOU@;oR8^ zoRod-#^l2Dmus)KUcXV@f4)07_`o`NFM8o)0SMr;`pfgP(49pAoLO`G?&R}rD?jSpk01Y{`1{EEZ~*?M{=&bvll}+h4}SQ+;m7+s_@s&< zS93p;f4x8EZ}QXJKee#L&zBha1^lF`IrvF?JOYYYSKh+{9szoiSg1v^C4ewt4MN${EDuGnsX!;; z;nEJZ-=|L$J%3a4v4UUtDBs4%{{9|I_j__RR0QY5cJu+qWopH>4c8;Fq3YRvlwTNo~Vr_3j!2AZAWNRzW zmlx+|euudb0WCeUf4kXS0(AGc`_rm_#^m+kW3N7j(#iZ5m*sTg*y@w}V>En77?yT; z1Ud!pr@uS&=wI#Y`=F2Y@33C8w%j9t{yO-b_lNw${lYi*Mg5tjLH$%SN$1VyfnV%* z`RAWG>F7(gHn}>AyK{c^$ZnocfQd-4mS4PFfn3B-%E5wZS0QEW59lOp+~&dlHtj9X zcMJYevOh|$sfXP7bSn1tEU#K)3YnPz(@To+8!}}t>2zc zbdT@j{H1->ukjoGKa*eQ{WyPzzf%9OA8BOBFBN-N`Cs9oXuF*K$GJ7h+lpu!PfpXY zoTHLhAe5R`zd-%$Sfl@S*YeyS&sNq~rCYL8ZgZtF;l4vQym5d1U;Uf^PzjT6aQ6*$ z8QCr#L_9<1yZd&vXzyU_|1b!#IQMjZdE@P0(Y>Z`$3xqX^B;x}`NjBq&VShY<-T6( zQ?YM`{JTf~;!o_iV^QiS^|6s}FZ4Rc0uat0Swlk2PwlZUK4To8vI1&-v9gMwiL*>@ zKe)w7-lS@5O+tCJgYKEHjQ$3z1^+nsN&eCSemDhba^i!)x77Lmy8{|Lws?5shBx$$rzUs{8_f4D*Nk ztA$3E#+l zZc-`$l{%@^#b~ z>rd*7POwg6ACXV>n~DkFs7ve%`J4LCyJ_k-^}R`#*j>9cxzC=y2Ro(2z-=)AGjoe+ z?xe>Cl2N#aZ|cef_YbEoUZp2^hvS|nSSioQDS5Jo9D@%`a^A6NE7rcL&M|jFxn|vT z4DdkrrEmkihd#6~cq8BYj`=F@@wo70{)a;!q2I(;>lc5^d<=e#{76gme(p^oHGzI+#69_EamIVc{iH+;v-6YIWQU?pWyQ0FlT<6?lTg3bX^gJt`0K zX7+q-TK&TB(jMcU=*jt$^(OBz@WaODy-9pq-4pXC$u#s=(K+=w^Lp0FG8TGhzCI9e_@C zy2WsBa&~tuaANU*JwhlkXa|r$>AjR*=v92JX8SNd4S%%1ySukT*7rv0GwyDZa+>&r zkGr-cLk?us&6yh97w|fkl zbPLIu*kixIR0#V-|FQGjPza_m@^qBVqzGaVPZ&-TX%oq4N zKk%#UCqi>Ar!Jgn(CY5Q^MxP$Cr>g?Prv8%X-2)dF(s!aUe^%#mY?kZ`S~;7J6l<) zAw8VOGYhK}snkKOEkS9D1k}Fp)fe1fV;NAp(j=yq<_N%F+nW7PEI=pt zaPU!FrQoLLl0{-X2Jff*lEgbDhXC_qBtEPCH=XNKcO>nR`kM7u(<{_Z&$AL=xu3{S z?^N(U-$EOgNoFBw`|N3}?`WUSHGRFa(wW=;&J`LGM}~)nc;tbk`0_PdTn~s1mO}R6 zR?UJpQV5dMHT*b$3j+W?0^m<<092w^yMY<7zPJRr^(eodTUKPDU+0Fy9X@0*@`IxpaS zLcUP*icQyz-pq8?y+ZW-=N>q zKHNt;_uCVL{e67{1IO3{{qD>Ks^6>Exy>7UpD`1**S=&Wpa5R>DnaV6NPu7N4N5`) zh(H>Y1$YySz=Q@;R)i40$V)z!PxA$U2fgr7eV6jIa;L&xgRk{!kC4LJ}=S|lH z-v|%zRo>Xv)|)YXjo;lHRyJ6GWlDZKy!U~rTRyXka2`7h|(rV{uq0%8E@ zKecefPcuXbWHzOopt*snEG`tB0*EN`UOBtCg&y<)U!@LQl9UJQ2YbrBbUTyv$o##$ zCwXtmJ}NTJo;x^=^LZN1-5V!BK+hx<*d3ya?Q8w$UhV__H%4>@=&XOKgs9#hlLd2Y zjfEhka2Y@L~y}%VCkO-uLvV_16T4hm?Rd^Jyq-4%2^!y3o z{ScoqU*4;FzxrJkqOABn zC-cWCV?l2kpE8IRUACzez!EMH56}gmz%|~o1G7mCq#>9vs23cTfNU5~2s~}!-%I$& zM+pVIcAp+sXy0Ntj&IImvj=%s+_&3H{{icl`f5Jh%_n!K?P{6Kbm%WR_0HsP!d_Fp zn|Nv;*LSny#0jUEC}zhWa!TI>b&!QH2EbGVVEeDnMQyF z`0ag@!N3z4fR8`_cg0Gg_{Mrsy`?_+J_(=N5ARpS!@rC513wLzMGA5{%AL#8)APC$ zar6;5RWrzUI+kfgYKi#J^ z-U6OZs{;Y1=S}%|zUUnw)%fv!C)vV$$@t6l^?lTToD|_d4ACMjGw5Vum|{NH~ZYh|JZK!|txBZ3!SBR|2eRa|#d`9p@P>d(w`59z8Y^)mPoQH8PVw@H(k&15A?PmBHbp?E7JTPBO|3`wp zo{#P=c4fZPjryd1D!#yXqI28aOW+4QH;L>zQu zOd4!)@Q@M247SY*S>^;REdhM2dJSMCVr`$dd_G_6cZr>uN5 z_5(tKgu!fz-~(m?6$dLB)CO$EbP_NW;06*wJVAXHBZ5z?z9v%?t{-X0%kJ4vb~io# z?z05$TXaJ^n7x!2^fy>d`Jm4xra=$uNBb8!9N>bsUpXJ>%lEN;iQllN_8s&Y80hQk G`}%(cy4YR- literal 0 HcmV?d00001 diff --git a/benchmarks/new_opencl/convolution/kernel.cl b/benchmarks/new_opencl/convolution/kernel.cl new file mode 100755 index 00000000..ff56dc6f --- /dev/null +++ b/benchmarks/new_opencl/convolution/kernel.cl @@ -0,0 +1,54 @@ +__kernel +void convolution( + __read_only image2d_t sourceImage, + __write_only image2d_t outputImage, + int rows, + int cols, + __constant float* filter, + int filterWidth, + sampler_t sampler) +{ + // Store each work-item’s unique row and column + int column = get_global_id(0); + int row = get_global_id(1); + + // Half the width of the filter is needed for indexing + // memory later + int halfWidth = (int)(filterWidth/2); + + // All accesses to images return data as four-element vector + // (i.e., float4), although only the 'x' component will contain + // meaningful data in this code + float4 sum = {0.0f, 0.0f, 0.0f, 0.0f}; + + // Iterator for the filter + int filterIdx = 0; + + // Each work-item iterates around its local area based on the + // size of the filter + int2 coords; // Coordinates for accessing the image + // Iterate the filter rows + for(int i = -halfWidth; i <= halfWidth; i++) { + coords.y = row + i; + + // Iterate over the filter columns + for(int j = -halfWidth; j <= halfWidth; j++) { + coords.x = column + j; + + float4 pixel; + // Read a pixel from the image. A single channel image + // stores the pixel in the 'x' coordinate of the returned + // vector. + pixel = read_imagef(sourceImage, sampler, coords); + sum.x += pixel.x * filter[filterIdx++]; + } + } + + // Copy the data to the output image if the + // work-item is in bounds + if(row < rows && column < cols) { + coords.x = column; + coords.y = row; + write_imagef(outputImage, coords, sum); + } +} \ No newline at end of file diff --git a/benchmarks/new_opencl/convolution/main.cpp b/benchmarks/new_opencl/convolution/main.cpp new file mode 100755 index 00000000..f5bf1584 --- /dev/null +++ b/benchmarks/new_opencl/convolution/main.cpp @@ -0,0 +1,261 @@ +#include +#include +#include + +#include "utils.h" + +// This function takes a positive integer and rounds it up to +// the nearest multiple of another provided integer +unsigned int roundUp(unsigned int value, unsigned int multiple) { + + // Determine how far past the nearest multiple the value is + unsigned int remainder = value % multiple; + + // Add the difference to make the value a multiple + if(remainder != 0) { + value += (multiple-remainder); + } + + return value; +} + +// This function reads in a text file and stores it as a char pointer +char* readSource(char* kernelPath) { + + cl_int status; + FILE *fp; + char *source; + long int size; + + printf("Program file is: %s\n", kernelPath); + + fp = fopen(kernelPath, "rb"); + if(!fp) { + printf("Could not open kernel file\n"); + exit(-1); + } + status = fseek(fp, 0, SEEK_END); + if(status != 0) { + printf("Error seeking to end of file\n"); + exit(-1); + } + size = ftell(fp); + if(size < 0) { + printf("Error getting file position\n"); + exit(-1); + } + + rewind(fp); + + source = (char *)malloc(size + 1); + + int i; + for (i = 0; i < size+1; i++) { + source[i]='\0'; + } + + if(source == NULL) { + printf("Error allocating space for the kernel source\n"); + exit(-1); + } + + fread(source, 1, size, fp); + source[size] = '\0'; + + return source; +} + +void chk(cl_int status, const char* cmd) { + + if(status != CL_SUCCESS) { + printf("%s failed (%d)\n", cmd, status); + exit(-1); + } +} + +int main() { + + int i, j, k, l; + + // Rows and columns in the input image + int imageHeight; + int imageWidth; + + const char* inputFile = "input.bmp"; + const char* outputFile = "output.bmp"; + + // Homegrown function to read a BMP from file + float* inputImage = readImage(inputFile, &imageWidth, + &imageHeight); + + // Size of the input and output images on the host + int dataSize = imageHeight*imageWidth*sizeof(float); + + // Output image on the host + float* outputImage = NULL; + outputImage = (float*)malloc(dataSize); + float* refImage = NULL; + refImage = (float*)malloc(dataSize); + + // 45 degree motion blur + float filter[49] = + {0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, + 0, 0, -1, 0, 1, 0, 0, + 0, 0, -2, 0, 2, 0, 0, + 0, 0, -1, 0, 1, 0, 0, + 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0}; + + // The convolution filter is 7x7 + int filterWidth = 7; + int filterSize = filterWidth*filterWidth; // Assume a square kernel + + // Set up the OpenCL environment + cl_int status; + + // Discovery platform + cl_platform_id platform; + status = clGetPlatformIDs(1, &platform, NULL); + chk(status, "clGetPlatformIDs"); + + // Discover device + cl_device_id device; + clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL); + chk(status, "clGetDeviceIDs"); + + // Create context + cl_context_properties props[3] = {CL_CONTEXT_PLATFORM, + (cl_context_properties)(platform), 0}; + cl_context context; + context = clCreateContext(props, 1, &device, NULL, NULL, &status); + chk(status, "clCreateContext"); + + // Create command queue + cl_command_queue queue; + queue = clCreateCommandQueue(context, device, 0, &status); + chk(status, "clCreateCommandQueue"); + + // The image format describes how the data will be stored in memory + cl_image_format format; + format.image_channel_order = CL_R; // single channel + format.image_channel_data_type = CL_FLOAT; // float data type + + // Create space for the source image on the device + cl_mem d_inputImage = clCreateImage2D(context, 0, &format, imageWidth, + imageHeight, 0, NULL, &status); + chk(status, "clCreateImage2D"); + + // Create space for the output image on the device + cl_mem d_outputImage = clCreateImage2D(context, 0, &format, imageWidth, + imageHeight, 0, NULL, &status); + chk(status, "clCreateImage2D"); + + // Create space for the 7x7 filter on the device + cl_mem d_filter = clCreateBuffer(context, 0, filterSize*sizeof(float), + NULL, &status); + chk(status, "clCreateBuffer"); + + // Copy the source image to the device + size_t origin[3] = {0, 0, 0}; // Offset within the image to copy from + size_t region[3] = {imageWidth, imageHeight, 1}; // Elements to per dimension + status = clEnqueueWriteImage(queue, d_inputImage, CL_FALSE, origin, region, + 0, 0, inputImage, 0, NULL, NULL); + chk(status, "clEnqueueWriteImage"); + + // Copy the 7x7 filter to the device + status = clEnqueueWriteBuffer(queue, d_filter, CL_FALSE, 0, + filterSize*sizeof(float), filter, 0, NULL, NULL); + chk(status, "clEnqueueWriteBuffer"); + + // Create the image sampler + cl_sampler sampler = clCreateSampler(context, CL_FALSE, + CL_ADDRESS_CLAMP_TO_EDGE, CL_FILTER_NEAREST, &status); + chk(status, "clCreateSampler"); + + const char* source = readSource("kernel.cl"); + + // Create a program object with source and build it + cl_program program; + program = clCreateProgramWithSource(context, 1, &source, NULL, NULL); + chk(status, "clCreateProgramWithSource"); + status = clBuildProgram(program, 1, &device, NULL, NULL, NULL); + chk(status, "clBuildProgram"); + + // Create the kernel object + cl_kernel kernel; + kernel = clCreateKernel(program, "convolution", &status); + chk(status, "clCreateKernel"); + + // Set the kernel arguments + status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_inputImage); + status |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_outputImage); + status |= clSetKernelArg(kernel, 2, sizeof(int), &imageHeight); + status |= clSetKernelArg(kernel, 3, sizeof(int), &imageWidth); + status |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &d_filter); + status |= clSetKernelArg(kernel, 5, sizeof(int), &filterWidth); + status |= clSetKernelArg(kernel, 6, sizeof(cl_sampler), &sampler); + chk(status, "clSetKernelArg"); + + // Set the work item dimensions + size_t globalSize[2] = {imageWidth, imageHeight}; + status = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, globalSize, NULL, 0, + NULL, NULL); + chk(status, "clEnqueueNDRange"); + + // Read the image back to the host + status = clEnqueueReadImage(queue, d_outputImage, CL_TRUE, origin, + region, 0, 0, outputImage, 0, NULL, NULL); + chk(status, "clEnqueueReadImage"); + + // Write the output image to file + storeImage(outputImage, outputFile, imageHeight, imageWidth, inputFile); + + // Compute the reference image + for(i = 0; i < imageHeight; i++) { + for(j = 0; j < imageWidth; j++) { + refImage[i*imageWidth+j] = 0; + } + } + + // Iterate over the rows of the source image + int halfFilterWidth = filterWidth/2; + float sum; + for(i = 0; i < imageHeight; i++) { + // Iterate over the columns of the source image + for(j = 0; j < imageWidth; j++) { + sum = 0; // Reset sum for new source pixel + // Apply the filter to the neighborhood + for(k = - halfFilterWidth; k <= halfFilterWidth; k++) { + for(l = - halfFilterWidth; l <= halfFilterWidth; l++) { + if(i+k >= 0 && i+k < imageHeight && + j+l >= 0 && j+l < imageWidth) { + sum += inputImage[(i+k)*imageWidth + j+l] * + filter[(k+halfFilterWidth)*filterWidth + + l+halfFilterWidth]; + } + } + } + refImage[i*imageWidth+j] = sum; + } + } + + int failed = 0; + for(i = 0; i < imageHeight; i++) { + for(j = 0; j < imageWidth; j++) { + if(abs(outputImage[i*imageWidth+j]-refImage[i*imageWidth+j]) > 0.01) { + printf("Results are INCORRECT\n"); + printf("Pixel mismatch at <%d,%d> (%f vs. %f)\n", i, j, + outputImage[i*imageWidth+j], refImage[i*imageWidth+j]); + failed = 1; + } + if(failed) break; + } + if(failed) break; + } + if(!failed) { + printf("Results are correct\n"); + } + + return 0; +} \ No newline at end of file diff --git a/benchmarks/new_opencl/convolution/utils.cpp b/benchmarks/new_opencl/convolution/utils.cpp new file mode 100644 index 00000000..74ca6dad --- /dev/null +++ b/benchmarks/new_opencl/convolution/utils.cpp @@ -0,0 +1,180 @@ +#include +#include + +#include "utils.h" + +void storeImage(float *imageOut, + const char *filename, + int rows, + int cols, + const char* refFilename) { + + FILE *ifp, *ofp; + unsigned char tmp; + int offset; + unsigned char *buffer; + int i, j; + + int bytes; + + int height, width; + + ifp = fopen(refFilename, "rb"); + if(ifp == NULL) { + perror(filename); + exit(-1); + } + + fseek(ifp, 10, SEEK_SET); + fread(&offset, 4, 1, ifp); + + fseek(ifp, 18, SEEK_SET); + fread(&width, 4, 1, ifp); + fread(&height, 4, 1, ifp); + + fseek(ifp, 0, SEEK_SET); + + buffer = (unsigned char *)malloc(offset); + if(buffer == NULL) { + perror("malloc"); + exit(-1); + } + + fread(buffer, 1, offset, ifp); + + printf("Writing output image to %s\n", filename); + ofp = fopen(filename, "wb"); + if(ofp == NULL) { + perror("opening output file"); + exit(-1); + } + bytes = fwrite(buffer, 1, offset, ofp); + if(bytes != offset) { + printf("error writing header!\n"); + exit(-1); + } + + // NOTE bmp formats store data in reverse raster order (see comment in + // readImage function), so we need to flip it upside down here. + int mod = width % 4; + if(mod != 0) { + mod = 4 - mod; + } + // printf("mod = %d\n", mod); + for(i = height-1; i >= 0; i--) { + for(j = 0; j < width; j++) { + tmp = (unsigned char)imageOut[i*cols+j]; + fwrite(&tmp, sizeof(char), 1, ofp); + } + // In bmp format, rows must be a multiple of 4-bytes. + // So if we're not at a multiple of 4, add junk padding. + for(j = 0; j < mod; j++) { + fwrite(&tmp, sizeof(char), 1, ofp); + } + } + + fclose(ofp); + fclose(ifp); + + free(buffer); +} + +/* + * Read bmp image and convert to byte array. Also output the width and height + */ +float* readImage(const char *filename, int* widthOut, int* heightOut) { + + uchar* imageData; + + int height, width; + uchar tmp; + int offset; + int i, j; + + printf("Reading input image from %s\n", filename); + FILE *fp = fopen(filename, "rb"); + if(fp == NULL) { + perror(filename); + exit(-1); + } + + fseek(fp, 10, SEEK_SET); + fread(&offset, 4, 1, fp); + + fseek(fp, 18, SEEK_SET); + fread(&width, 4, 1, fp); + fread(&height, 4, 1, fp); + + printf("width = %d\n", width); + printf("height = %d\n", height); + + *widthOut = width; + *heightOut = height; + + imageData = (uchar*)malloc(width*height); + if(imageData == NULL) { + perror("malloc"); + exit(-1); + } + + fseek(fp, offset, SEEK_SET); + fflush(NULL); + + int mod = width % 4; + if(mod != 0) { + mod = 4 - mod; + } + + // NOTE bitmaps are stored in upside-down raster order. So we begin + // reading from the bottom left pixel, then going from left-to-right, + // read from the bottom to the top of the image. For image analysis, + // we want the image to be right-side up, so we'll modify it here. + + // First we read the image in upside-down + + // Read in the actual image + for(i = 0; i < height; i++) { + + // add actual data to the image + for(j = 0; j < width; j++) { + fread(&tmp, sizeof(char), 1, fp); + imageData[i*width + j] = tmp; + } + // For the bmp format, each row has to be a multiple of 4, + // so I need to read in the junk data and throw it away + for(j = 0; j < mod; j++) { + fread(&tmp, sizeof(char), 1, fp); + } + } + + // Then we flip it over + int flipRow; + for(i = 0; i < height/2; i++) { + flipRow = height - (i+1); + for(j = 0; j < width; j++) { + tmp = imageData[i*width+j]; + imageData[i*width+j] = imageData[flipRow*width+j]; + imageData[flipRow*width+j] = tmp; + } + } + + fclose(fp); + + // Input image on the host + float* floatImage = NULL; + floatImage = (float*)malloc(sizeof(float)*width*height); + if(floatImage == NULL) { + perror("malloc"); + exit(-1); + } + + // Convert the BMP image to float (not required) + for(i = 0; i < height; i++) { + for(j = 0; j < width; j++) { + floatImage[i*width+j] = (float)imageData[i*width+j]; + } + } + + free(imageData); + return floatImage; +} \ No newline at end of file diff --git a/benchmarks/new_opencl/convolution/utils.h b/benchmarks/new_opencl/convolution/utils.h new file mode 100644 index 00000000..2686de50 --- /dev/null +++ b/benchmarks/new_opencl/convolution/utils.h @@ -0,0 +1,11 @@ +#ifndef __UTILS__ +#define __UTILS__ + +typedef unsigned char uchar; + +float* readImage(const char *filename, int* widthOut, int* heightOut); + +void storeImage(float *imageOut, const char *filename, int rows, int cols, + const char* refFilename); + +#endif \ No newline at end of file diff --git a/benchmarks/new_opencl/transpose/.gitignore b/benchmarks/new_opencl/transpose/.gitignore new file mode 100644 index 00000000..dd07f846 --- /dev/null +++ b/benchmarks/new_opencl/transpose/.gitignore @@ -0,0 +1 @@ +transpose \ No newline at end of file diff --git a/benchmarks/new_opencl/transpose/Makefile b/benchmarks/new_opencl/transpose/Makefile new file mode 100644 index 00000000..4092ed9f --- /dev/null +++ b/benchmarks/new_opencl/transpose/Makefile @@ -0,0 +1,44 @@ +RISCV_TOOL_PATH ?= $(wildcard ~/dev/riscv-gnu-toolchain/drops) +POCLCC_PATH ?= $(wildcard ~/dev/pocl/drops_vortex_cc) +POCLRT_PATH ?= $(wildcard ..) +DRIVER_PATH ?= $(wildcard ../../../driver/sw) + +CXXFLAGS += -std=c++11 -O0 -g -fpermissive -Wall -Wextra -pedantic -Wfatal-errors + +CXXFLAGS += -I$(POCLRT_PATH)/include + +LDFLAGS += -L$(POCLRT_PATH)/lib -L$(DRIVER_PATH)/simx -lOpenCL -lvortex + +PROJECT = transpose + +SRCS = main.cc transpose_gold.cpp + +all: $(PROJECT) + +kernel.pocl: kernel.cl + POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCLCC_PATH)/lib:$(DRIVER_PATH)/simx $(POCLCC_PATH)/bin/poclcc -o kernel.pocl kernel.cl + +$(PROJECT): $(SRCS) + $(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@ + +run-fpga: $(PROJECT) kernel.pocl + LD_LIBRARY_PATH=$(POCLRT_PATH)/lib:$(DRIVER_PATH)/opae:$(LD_LIBRARY_PATH) ./$(PROJECT) + +run-ase: $(PROJECT) kernel.pocl + LD_LIBRARY_PATH=$(POCLRT_PATH)/lib:$(DRIVER_PATH)/opae/ase:$(LD_LIBRARY_PATH) ./$(PROJECT) + +run-simx: $(PROJECT) kernel.pocl + LD_LIBRARY_PATH=$(POCLRT_PATH)/lib:$(DRIVER_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) + +run-rtlsim: $(PROJECT) kernel.pocl + LD_LIBRARY_PATH=$(POCLRT_PATH)/lib:$(DRIVER_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) + +.depend: $(SRCS) + $(CXX) $(CXXFLAGS) -MM $^ > .depend; + +clean: + rm -rf $(PROJECT) *.o *.dump .depend + +ifneq ($(MAKECMDGOALS),clean) + -include .depend +endif \ No newline at end of file diff --git a/benchmarks/new_opencl/transpose/main.cc b/benchmarks/new_opencl/transpose/main.cc new file mode 100644 index 00000000..f72cb851 --- /dev/null +++ b/benchmarks/new_opencl/transpose/main.cc @@ -0,0 +1,387 @@ +/* + * Copyright 1993-2010 NVIDIA Corporation. All rights reserved. + * + * Please refer to the NVIDIA end user license agreement (EULA) associated + * with this source code for terms and conditions that govern your use of + * this software. Any use, reproduction, disclosure, or distribution of + * this software and related documentation outside the terms of the EULA + * is strictly prohibited. + * + */ + +/* Matrix transpose with Cuda + * Host code. + + * This example transposes arbitrary-size matrices. It compares a naive + * transpose kernel that suffers from non-coalesced writes, to an optimized + * transpose with fully coalesced memory access and no bank conflicts. On + * a G80 GPU, the optimized transpose can be more than 10x faster for large + * matrices. + */ + +// standard utility and system includes +#include "oclUtils.h" +#include "shrQATest.h" + +#define BLOCK_DIM 16 + +// max GPU's to manage for multi-GPU parallel compute +const unsigned int MAX_GPU_COUNT = 8; + +// global variables +cl_platform_id cpPlatform; +cl_uint uiNumDevices; +cl_device_id* cdDevices; +cl_context cxGPUContext; +cl_kernel ckKernel[MAX_GPU_COUNT]; +cl_command_queue commandQueue[MAX_GPU_COUNT]; +cl_program rv_program; + +// forward declarations +// ********************************************************************* +int runTest( int argc, const char** argv); +extern "C" void computeGold( float* reference, float* idata, + const unsigned int size_x, const unsigned int size_y ); + +// Main Program +// ********************************************************************* +int main( int argc, const char** argv) +{ + shrQAStart(argc, (char **)argv); + + // set logfile name and start logs + shrSetLogFileName ("oclTranspose.txt"); + shrLog("%s Starting...\n\n", argv[0]); + + // run the main test + int result = runTest(argc, argv); + //oclCheckError(result, 0); +} + +double transposeGPU(const char* kernelName, bool useLocalMem, cl_uint ciDeviceCount, float* h_idata, float* h_odata, unsigned int size_x, unsigned int size_y) +{ + cl_mem d_odata[MAX_GPU_COUNT]; + cl_mem d_idata[MAX_GPU_COUNT]; + cl_kernel ckKernel[MAX_GPU_COUNT]; + + size_t szGlobalWorkSize[2]; + size_t szLocalWorkSize[2]; + cl_int ciErrNum; + + // Create buffers for each GPU + // Each GPU will compute sizePerGPU rows of the result + size_t sizePerGPU = shrRoundUp(BLOCK_DIM, (size_x+ciDeviceCount-1) / ciDeviceCount); + + // size of memory required to store the matrix + const size_t mem_size = sizeof(float) * size_x * size_y; + + for(unsigned int i = 0; i < ciDeviceCount; ++i){ + // allocate device memory and copy host to device memory + d_idata[i] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + mem_size, h_idata, &ciErrNum); + //oclCheckError(ciErrNum, CL_SUCCESS); + + // create buffer to store output + d_odata[i] = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY , + sizePerGPU*size_y*sizeof(float), NULL, &ciErrNum); + //oclCheckError(ciErrNum, CL_SUCCESS); + + // create the naive transpose kernel + ckKernel[i] = clCreateKernel(rv_program, kernelName, &ciErrNum); + //oclCheckError(ciErrNum, CL_SUCCESS); + + // set the args values for the naive kernel + size_t offset = i * sizePerGPU; + ciErrNum = clSetKernelArg(ckKernel[i], 0, sizeof(cl_mem), (void *) &d_odata[i]); + ciErrNum |= clSetKernelArg(ckKernel[i], 1, sizeof(cl_mem), (void *) &d_idata[0]); + ciErrNum |= clSetKernelArg(ckKernel[i], 2, sizeof(int), &offset); + ciErrNum |= clSetKernelArg(ckKernel[i], 3, sizeof(int), &size_x); + ciErrNum |= clSetKernelArg(ckKernel[i], 4, sizeof(int), &size_y); + if(useLocalMem) + { + ciErrNum |= clSetKernelArg(ckKernel[i], 5, (BLOCK_DIM + 1) * BLOCK_DIM * sizeof(float), 0 ); + } + } + //oclCheckError(ciErrNum, CL_SUCCESS); + + // set up execution configuration + szLocalWorkSize[0] = BLOCK_DIM; + szLocalWorkSize[1] = BLOCK_DIM; + szGlobalWorkSize[0] = sizePerGPU; + szGlobalWorkSize[1] = shrRoundUp(BLOCK_DIM, size_y); + + // execute the kernel numIterations times + int numIterations = 100; + shrLog("\nProcessing a %d by %d matrix of floats...\n\n", size_x, size_y); + for (int i = -1; i < numIterations; ++i) + { + // Start time measurement after warmup + if( i == 0 ) shrDeltaT(0); + + for(unsigned int k=0; k < ciDeviceCount; ++k){ + ciErrNum |= clEnqueueNDRangeKernel(commandQueue[k], ckKernel[k], 2, NULL, + szGlobalWorkSize, szLocalWorkSize, 0, NULL, NULL); + } + //oclCheckError(ciErrNum, CL_SUCCESS); + } + + // Block CPU till GPU is done + for(unsigned int k=0; k < ciDeviceCount; ++k){ + ciErrNum |= clFinish(commandQueue[k]); + } + double time = shrDeltaT(0)/(double)numIterations; + //oclCheckError(ciErrNum, CL_SUCCESS); + + // Copy back to host + for(unsigned int i = 0; i < ciDeviceCount; ++i){ + size_t offset = i * sizePerGPU; + size_t size = MIN(size_x - i * sizePerGPU, sizePerGPU); + + ciErrNum |= clEnqueueReadBuffer(commandQueue[i], d_odata[i], CL_TRUE, 0, + size * size_y * sizeof(float), &h_odata[offset * size_y], + 0, NULL, NULL); + } + //oclCheckError(ciErrNum, CL_SUCCESS); + + for(unsigned int i = 0; i < ciDeviceCount; ++i){ + ciErrNum |= clReleaseMemObject(d_idata[i]); + ciErrNum |= clReleaseMemObject(d_odata[i]); + ciErrNum |= clReleaseKernel(ckKernel[i]); + } + //oclCheckError(ciErrNum, CL_SUCCESS); + + return time; +} +uint8_t *kernel_bin = NULL; + +static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) { + if (nullptr == filename || nullptr == data || 0 == size) + return -1; + + FILE* fp = fopen(filename, "r"); + if (NULL == fp) { + fprintf(stderr, "Failed to load kernel."); + return -1; + } + fseek(fp , 0 , SEEK_END); + long fsize = ftell(fp); + rewind(fp); + + *data = (uint8_t*)malloc(fsize); + *size = fread(*data, 1, fsize, fp); + + fclose(fp); + + return 0; +} +//! Run a simple test for CUDA +// ********************************************************************* +int runTest( const int argc, const char** argv) +{ + cl_int ciErrNum; + cl_uint ciDeviceCount; + unsigned int size_x = 2048; + unsigned int size_y = 2048; + + int temp; + if( shrGetCmdLineArgumenti( argc, argv,"width", &temp) ){ + size_x = temp; + } + + if( shrGetCmdLineArgumenti( argc, argv,"height", &temp) ){ + size_y = temp; + } + + // size of memory required to store the matrix + const size_t mem_size = sizeof(float) * size_x * size_y; + + //Get the NVIDIA platform + ciErrNum = oclGetPlatformID(&cpPlatform); + //oclCheckError(ciErrNum, CL_SUCCESS); + + //Get the devices + ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 0, NULL, &uiNumDevices); + //oclCheckError(ciErrNum, CL_SUCCESS); + cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) ); + ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, uiNumDevices, cdDevices, NULL); + //oclCheckError(ciErrNum, CL_SUCCESS); + + //Create the context + cxGPUContext = clCreateContext(0, uiNumDevices, cdDevices, NULL, NULL, &ciErrNum); + //oclCheckError(ciErrNum, CL_SUCCESS); + + if(shrCheckCmdLineFlag(argc, (const char**)argv, "device")) + { + ciDeviceCount = 0; + // User specified GPUs + char* deviceList; + char* deviceStr; + + shrGetCmdLineArgumentstr(argc, (const char**)argv, "device", &deviceList); + + #ifdef WIN32 + char* next_token; + deviceStr = strtok_s (deviceList," ,.-", &next_token); + #else + deviceStr = strtok (deviceList," ,.-"); + #endif + ciDeviceCount = 0; + while(deviceStr != NULL) + { + // get and print the device for this queue + cl_device_id device = oclGetDev(cxGPUContext, atoi(deviceStr)); + if( device == (cl_device_id)-1 ) { + shrLog(" Invalid Device: %s\n\n", deviceStr); + return -1; + } + + shrLog("Device %d: ", atoi(deviceStr)); + oclPrintDevName(LOGBOTH, device); + shrLog("\n"); + + // create command queue + commandQueue[ciDeviceCount] = clCreateCommandQueue(cxGPUContext, device, CL_QUEUE_PROFILING_ENABLE, &ciErrNum); + if (ciErrNum != CL_SUCCESS) + { + shrLog(" Error %i in clCreateCommandQueue call !!!\n\n", ciErrNum); + return ciErrNum; + } + + ++ciDeviceCount; + + #ifdef WIN32 + deviceStr = strtok_s (NULL," ,.-", &next_token); + #else + deviceStr = strtok (NULL," ,.-"); + #endif + } + + free(deviceList); + } + else + { + // Find out how many GPU's to compute on all available GPUs + size_t nDeviceBytes; + ciErrNum |= clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &nDeviceBytes); + ciDeviceCount = (cl_uint)nDeviceBytes/sizeof(cl_device_id); + + if (ciErrNum != CL_SUCCESS) + { + shrLog(" Error %i in clGetDeviceIDs call !!!\n\n", ciErrNum); + return ciErrNum; + } + else if (ciDeviceCount == 0) + { + shrLog(" There are no devices supporting OpenCL (return code %i)\n\n", ciErrNum); + return -1; + } + + // create command-queues + for(unsigned int i = 0; i < ciDeviceCount; ++i) + { + // get and print the device for this queue + cl_device_id device = oclGetDev(cxGPUContext, i); + shrLog("Device %d: ", i); + oclPrintDevName(LOGBOTH, device); + shrLog("\n"); + + // create command queue + commandQueue[i] = clCreateCommandQueue(cxGPUContext, device, CL_QUEUE_PROFILING_ENABLE, &ciErrNum); + if (ciErrNum != CL_SUCCESS) + { + shrLog(" Error %i in clCreateCommandQueue call !!!\n\n", ciErrNum); + return ciErrNum; + } + } + } + + // allocate and initalize host memory + float* h_idata = (float*)malloc(mem_size); + float* h_odata = (float*) malloc(mem_size); + srand(15235911); + shrFillArray(h_idata, (size_x * size_y)); + + // Program Setup + size_t program_length; + char* source_path = shrFindFilePath("transpose.cl", argv[0]); + //oclCheckError(source_path != NULL, shrTRUE); + char *source = oclLoadProgSource(source_path, "", &program_length); + //oclCheckError(source != NULL, shrTRUE); + size_t kernel_size; + cl_int binary_status = 0; + cl_device_id device_id; + // create the program + rv_program = clCreateProgramWithBinary(cxGPUContext, 1, &device_id, &kernel_size, &kernel_bin, &binary_status, NULL); + //rv_program = clCreateProgramWithSource(cxGPUContext, 1, + // (const char **)&source, &program_length, &ciErrNum); + //oclCheckError(ciErrNum, CL_SUCCESS); + + // build the program + ciErrNum = clBuildProgram(rv_program, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL); + if (ciErrNum != CL_SUCCESS) + { + // write out standard error, Build Log and PTX, then return error + shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR); + oclLogBuildInfo(rv_program, oclGetFirstDev(cxGPUContext)); + oclLogPtx(rv_program, oclGetFirstDev(cxGPUContext), "oclTranspose.ptx"); + return(EXIT_FAILURE); + } + + // Run Naive Kernel +#ifdef GPU_PROFILING + // Matrix Copy kernel runs to measure reference performance. + double uncoalescedCopyTime = transposeGPU("uncoalesced_copy", false, ciDeviceCount, h_idata, h_odata, size_x, size_y); + double simpleCopyTime = transposeGPU("simple_copy", false, ciDeviceCount, h_idata, h_odata, size_x, size_y); + double sharedCopyTime = transposeGPU("shared_copy", true, ciDeviceCount, h_idata, h_odata, size_x, size_y); +#endif + + double naiveTime = transposeGPU("transpose_naive", false, ciDeviceCount, h_idata, h_odata, size_x, size_y); + double optimizedTime = transposeGPU("transpose", true, ciDeviceCount, h_idata, h_odata, size_x, size_y); + +#ifdef GPU_PROFILING + // log times + + shrLogEx(LOGBOTH | MASTER, 0, "oclTranspose-Outer-simple copy, Throughput = %.4f GB/s, Time = %.5f s, Size = %u fp32 elements, NumDevsUsed = %u, Workgroup = %u\n", + (1.0e-9 * double(size_x * size_y * sizeof(float))/simpleCopyTime), simpleCopyTime, (size_x * size_y), ciDeviceCount, BLOCK_DIM * BLOCK_DIM); + + shrLogEx(LOGBOTH | MASTER, 0, "oclTranspose-Outer-shared memory copy, Throughput = %.4f GB/s, Time = %.5f s, Size = %u fp32 elements, NumDevsUsed = %u, Workgroup = %u\n", + (1.0e-9 * double(size_x * size_y * sizeof(float))/sharedCopyTime), sharedCopyTime, (size_x * size_y), ciDeviceCount, BLOCK_DIM * BLOCK_DIM); + + shrLogEx(LOGBOTH | MASTER, 0, "oclTranspose-Outer-uncoalesced copy, Throughput = %.4f GB/s, Time = %.5f s, Size = %u fp32 elements, NumDevsUsed = %u, Workgroup = %u\n", + (1.0e-9 * double(size_x * size_y * sizeof(float))/uncoalescedCopyTime), uncoalescedCopyTime, (size_x * size_y), ciDeviceCount, BLOCK_DIM * BLOCK_DIM); + + shrLogEx(LOGBOTH | MASTER, 0, "oclTranspose-Outer-naive, Throughput = %.4f GB/s, Time = %.5f s, Size = %u fp32 elements, NumDevsUsed = %u, Workgroup = %u\n", + (1.0e-9 * double(size_x * size_y * sizeof(float))/naiveTime), naiveTime, (size_x * size_y), ciDeviceCount, BLOCK_DIM * BLOCK_DIM); + + shrLogEx(LOGBOTH | MASTER, 0, "oclTranspose-Outer-optimized, Throughput = %.4f GB/s, Time = %.5f s, Size = %u fp32 elements, NumDevsUsed = %u, Workgroup = %u\n", + (1.0e-9 * double(size_x * size_y * sizeof(float))/optimizedTime), optimizedTime, (size_x * size_y), ciDeviceCount, BLOCK_DIM * BLOCK_DIM); + +#endif + + // compute reference solution and cross check results + float* reference = (float*)malloc( mem_size); + computeGold( reference, h_idata, size_x, size_y); + shrLog("\nComparing results with CPU computation... \n\n"); + shrBOOL res = shrComparef( reference, h_odata, size_x * size_y); + + // cleanup memory + free(h_idata); + free(h_odata); + free(reference); + free(source); + free(source_path); + + // cleanup OpenCL + ciErrNum = clReleaseProgram(rv_program); + for(unsigned int i = 0; i < ciDeviceCount; ++i) + { + ciErrNum |= clReleaseCommandQueue(commandQueue[i]); + } + ciErrNum |= clReleaseContext(cxGPUContext); + //oclCheckError(ciErrNum, CL_SUCCESS); + + // pass or fail (cumulative... all tests in the loop) + shrQAFinishExit(argc, (const char **)argv, (1 == res) ? QA_PASSED : QA_FAILED); + + return 0; +} diff --git a/benchmarks/new_opencl/transpose/oclUtils.h b/benchmarks/new_opencl/transpose/oclUtils.h new file mode 100644 index 00000000..096612a8 --- /dev/null +++ b/benchmarks/new_opencl/transpose/oclUtils.h @@ -0,0 +1,198 @@ +/* + * Copyright 1993-2010 NVIDIA Corporation. All rights reserved. + * + * Please refer to the NVIDIA end user license agreement (EULA) associated + * with this source code for terms and conditions that govern your use of + * this software. Any use, reproduction, disclosure, or distribution of + * this software and related documentation outside the terms of the EULA + * is strictly prohibited. + * + */ + +#ifndef OCL_UTILS_H +#define OCL_UTILS_H + +// ********************************************************************* +// Utilities specific to OpenCL samples in NVIDIA GPU Computing SDK +// ********************************************************************* + +// Common headers: Cross-API utililties and OpenCL header +#include "shrUtils.h" + +// All OpenCL headers +#if defined (__APPLE__) || defined(MACOSX) + #include +#else + #include +#endif + +// Includes +#include +#include +#include + +// For systems with CL_EXT that are not updated with these extensions, we copied these +// extensions from +#ifndef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV + /* cl_nv_device_attribute_query extension - no extension #define since it has no functions */ + #define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000 + #define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 0x4001 + #define CL_DEVICE_REGISTERS_PER_BLOCK_NV 0x4002 + #define CL_DEVICE_WARP_SIZE_NV 0x4003 + #define CL_DEVICE_GPU_OVERLAP_NV 0x4004 + #define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV 0x4005 + #define CL_DEVICE_INTEGRATED_MEMORY_NV 0x4006 +#endif + +// reminders for build output window and log +#ifdef _WIN32 + #pragma message ("Note: including shrUtils.h") + #pragma message ("Note: including opencl.h") +#endif + +// SDK Revision # +#define OCL_SDKREVISION "7027912" + +// Error and Exit Handling Macros... +// ********************************************************************* +// Full error handling macro with Cleanup() callback (if supplied)... +// (Companion Inline Function lower on page) +#define oclCheckErrorEX(a, b, c) __oclCheckErrorEX(a, b, c, __FILE__ , __LINE__) + +// Short version without Cleanup() callback pointer +// Both Input (a) and Reference (b) are specified as args +#define oclCheckError(a, b) oclCheckErrorEX(a, b, 0) + +////////////////////////////////////////////////////////////////////////////// +//! Gets the platform ID for NVIDIA if available, otherwise default to platform 0 +//! +//! @return the id +//! @param clSelectedPlatformID OpenCL platform ID +////////////////////////////////////////////////////////////////////////////// +extern "C" cl_int oclGetPlatformID(cl_platform_id* clSelectedPlatformID); + +////////////////////////////////////////////////////////////////////////////// +//! Print info about the device +//! +//! @param iLogMode enum LOGBOTH, LOGCONSOLE, LOGFILE +//! @param device OpenCL id of the device +////////////////////////////////////////////////////////////////////////////// +extern "C" void oclPrintDevInfo(int iLogMode, cl_device_id device); + +////////////////////////////////////////////////////////////////////////////// +//! Get and return device capability +//! +//! @return the 2 digit integer representation of device Cap (major minor). return -1 if NA +//! @param device OpenCL id of the device +////////////////////////////////////////////////////////////////////////////// +extern "C" int oclGetDevCap(cl_device_id device); + +////////////////////////////////////////////////////////////////////////////// +//! Print the device name +//! +//! @param iLogMode enum LOGBOTH, LOGCONSOLE, LOGFILE +//! @param device OpenCL id of the device +////////////////////////////////////////////////////////////////////////////// +extern "C" void oclPrintDevName(int iLogMode, cl_device_id device); + +////////////////////////////////////////////////////////////////////////////// +//! Gets the id of the first device from the context +//! +//! @return the id +//! @param cxGPUContext OpenCL context +////////////////////////////////////////////////////////////////////////////// +extern "C" cl_device_id oclGetFirstDev(cl_context cxGPUContext); + +////////////////////////////////////////////////////////////////////////////// +//! Gets the id of the nth device from the context +//! +//! @return the id or -1 when out of range +//! @param cxGPUContext OpenCL context +//! @param device_idx index of the device of interest +////////////////////////////////////////////////////////////////////////////// +extern "C" cl_device_id oclGetDev(cl_context cxGPUContext, unsigned int device_idx); + +////////////////////////////////////////////////////////////////////////////// +//! Gets the id of device with maximal FLOPS from the context +//! +//! @return the id +//! @param cxGPUContext OpenCL context +////////////////////////////////////////////////////////////////////////////// +extern "C" cl_device_id oclGetMaxFlopsDev(cl_context cxGPUContext); + +////////////////////////////////////////////////////////////////////////////// +//! Loads a Program file and prepends the cPreamble to the code. +//! +//! @return the source string if succeeded, 0 otherwise +//! @param cFilename program filename +//! @param cPreamble code that is prepended to the loaded file, typically a set of #defines or a header +//! @param szFinalLength returned length of the code string +////////////////////////////////////////////////////////////////////////////// +extern "C" char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength); + +////////////////////////////////////////////////////////////////////////////// +//! Get the binary (PTX) of the program associated with the device +//! +//! @param cpProgram OpenCL program +//! @param cdDevice device of interest +//! @param binary returned code +//! @param length length of returned code +////////////////////////////////////////////////////////////////////////////// +extern "C" void oclGetProgBinary( cl_program cpProgram, cl_device_id cdDevice, char** binary, size_t* length); + +////////////////////////////////////////////////////////////////////////////// +//! Get and log the binary (PTX) from the OpenCL compiler for the requested program & device +//! +//! @param cpProgram OpenCL program +//! @param cdDevice device of interest +//! @param const char* cPtxFileName optional PTX file name +////////////////////////////////////////////////////////////////////////////// +extern "C" void oclLogPtx(cl_program cpProgram, cl_device_id cdDevice, const char* cPtxFileName); + +////////////////////////////////////////////////////////////////////////////// +//! Get and log the Build Log from the OpenCL compiler for the requested program & device +//! +//! @param cpProgram OpenCL program +//! @param cdDevice device of interest +////////////////////////////////////////////////////////////////////////////// +extern "C" void oclLogBuildInfo(cl_program cpProgram, cl_device_id cdDevice); + +// Helper function for De-allocating cl objects +// ********************************************************************* +extern "C" void oclDeleteMemObjs(cl_mem* cmMemObjs, int iNumObjs); + +// Helper function to get OpenCL error string from constant +// ********************************************************************* +extern "C" const char* oclErrorString(cl_int error); + +// Helper function to get OpenCL image format string (channel order and type) from constant +// ********************************************************************* +extern "C" const char* oclImageFormatString(cl_uint uiImageFormat); + +// companion inline function for error checking and exit on error WITH Cleanup Callback (if supplied) +// ********************************************************************* +inline void __oclCheckErrorEX(cl_int iSample, cl_int iReference, void (*pCleanup)(int), const char* cFile, const int iLine) +{ + // An error condition is defined by the sample/test value not equal to the reference + if (iReference != iSample) + { + // If the sample/test value isn't equal to the ref, it's an error by defnition, so override 0 sample/test value + iSample = (iSample == 0) ? -9999 : iSample; + + // Log the error info + shrLog("\n !!! Error # %i (%s) at line %i , in file %s !!!\n\n", iSample, oclErrorString(iSample), iLine, cFile); + + // Cleanup and exit, or just exit if no cleanup function pointer provided. Use iSample (error code in this case) as process exit code. + if (pCleanup != NULL) + { + pCleanup(iSample); + } + else + { + shrLogEx(LOGBOTH | CLOSELOG, 0, "Exiting...\n"); + exit(iSample); + } + } +} + +#endif \ No newline at end of file diff --git a/benchmarks/new_opencl/transpose/shrQATest.h b/benchmarks/new_opencl/transpose/shrQATest.h new file mode 100644 index 00000000..93d2d9eb --- /dev/null +++ b/benchmarks/new_opencl/transpose/shrQATest.h @@ -0,0 +1,238 @@ +/* +* Copyright 1993-2010 NVIDIA Corporation. All rights reserved. +* +* Please refer to the NVIDIA end user license agreement (EULA) associated +* with this source code for terms and conditions that govern your use of +* this software. Any use, reproduction, disclosure, or distribution of +* this software and related documentation outside the terms of the EULA +* is strictly prohibited. +* +*/ + +#ifndef SHR_QATEST_H +#define SHR_QATEST_H + +// ********************************************************************* +// Generic utilities for NVIDIA GPU Computing SDK +// ********************************************************************* + +// OS dependent includes +#ifdef _WIN32 + #pragma message ("Note: including windows.h") + #pragma message ("Note: including math.h") + #pragma message ("Note: including assert.h") + #pragma message ("Note: including time.h") + +// Headers needed for Windows + #include + #include +#else + // Headers needed for Linux + #include + #include + #include + #include + #include + #include + #include + #include + #include +#endif + +#ifndef STRCASECMP +#ifdef _WIN32 +#define STRCASECMP _stricmp +#else +#define STRCASECMP strcasecmp +#endif +#endif + +#ifndef STRNCASECMP +#ifdef _WIN32 +#define STRNCASECMP _strnicmp +#else +#define STRNCASECMP strncasecmp +#endif +#endif + + +// Standardized QA Start/Finish for CUDA SDK tests +#define shrQAStart(a, b) __shrQAStart(a, b) +#define shrQAFinish(a, b, c) __shrQAFinish(a, b, c) +#define shrQAFinish2(a, b, c, d) __shrQAFinish2(a, b, c, d) + +inline int findExeNameStart(const char *exec_name) +{ + int exename_start = (int)strlen(exec_name); + + while( (exename_start > 0) && + (exec_name[exename_start] != '\\') && + (exec_name[exename_start] != '/') ) + { + exename_start--; + } + if (exec_name[exename_start] == '\\' || + exec_name[exename_start] == '/') + { + return exename_start+1; + } else { + return exename_start; + } +} + +inline int __shrQAStart(int argc, char **argv) +{ + bool bQATest = false; + // First clear the output buffer + fflush(stdout); + fflush(stdout); + + for (int i=1; i < argc; i++) { + int string_start = 0; + while (argv[i][string_start] == '-') + string_start++; + char *string_argv = &argv[i][string_start]; + + if (!STRCASECMP(string_argv, "qatest")) { + bQATest = true; + } + } + + // We don't want to print the entire path, so we search for the first + int exename_start = findExeNameStart(argv[0]); + if (bQATest) { + fprintf(stdout, "&&&& RUNNING %s", &(argv[0][exename_start])); + for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]); + fprintf(stdout, "\n"); + } else { + fprintf(stdout, "[%s] starting...\n", &(argv[0][exename_start])); + } + fflush(stdout); + printf("\n"); fflush(stdout); + return exename_start; +} + +enum eQAstatus { + QA_FAILED = 0, + QA_PASSED = 1, + QA_WAIVED = 2 +}; + +inline void __ExitInTime(int seconds) +{ + fprintf(stdout, "> exiting in %d seconds: ", seconds); + fflush(stdout); + time_t t; + int count; + for (t=time(0)+seconds, count=seconds; time(0) < t; count--) { + fprintf(stdout, "%d...", count); +#ifdef WIN32 + Sleep(1000); +#else + sleep(1); +#endif + } + fprintf(stdout,"done!\n\n"); + fflush(stdout); +} + + +inline void __shrQAFinish(int argc, const char **argv, int iStatus) +{ + // By default QATest is disabled and NoPrompt is Enabled (times out at seconds passed into __ExitInTime() ) + bool bQATest = false, bNoPrompt = true, bQuitInTime = true; + const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL }; + + for (int i=1; i < argc; i++) { + int string_start = 0; + while (argv[i][string_start] == '-') + string_start++; + + const char *string_argv = &argv[i][string_start]; + if (!STRCASECMP(string_argv, "qatest")) { + bQATest = true; + } + // For SDK individual samples that don't specify -noprompt or -prompt, + // a 3 second delay will happen before exiting, giving a user time to view results + if (!STRCASECMP(string_argv, "noprompt") || !STRCASECMP(string_argv, "help")) { + bNoPrompt = true; + bQuitInTime = false; + } + if (!STRCASECMP(string_argv, "prompt")) { + bNoPrompt = false; + bQuitInTime = false; + } + } + + int exename_start = findExeNameStart(argv[0]); + if (bQATest) { + fprintf(stdout, "&&&& %s %s", sStatus[iStatus], &(argv[0][exename_start])); + for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]); + fprintf(stdout, "\n"); + } else { + fprintf(stdout, "[%s] test results...\n%s\n", &(argv[0][exename_start]), sStatus[iStatus]); + } + fflush(stdout); + printf("\n"); fflush(stdout); + if (bQuitInTime) { + __ExitInTime(3); + } else { + if (!bNoPrompt) { + fprintf(stdout, "\nPress to exit...\n"); + fflush(stdout); + getchar(); + } + } +} + +inline void __shrQAFinish2(bool bQATest, int argc, const char **argv, int iStatus) +{ + bool bQuitInTime = true; + const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL }; + + for (int i=1; i < argc; i++) { + int string_start = 0; + while (argv[i][string_start] == '-') + string_start++; + + const char *string_argv = &argv[i][string_start]; + // For SDK individual samples that don't specify -noprompt or -prompt, + // a 3 second delay will happen before exiting, giving a user time to view results + if (!STRCASECMP(string_argv, "noprompt") || !STRCASECMP(string_argv, "help")) { + bQuitInTime = false; + } + if (!STRCASECMP(string_argv, "prompt")) { + bQuitInTime = false; + } + } + + int exename_start = findExeNameStart(argv[0]); + if (bQATest) { + fprintf(stdout, "&&&& %s %s", sStatus[iStatus], &(argv[0][exename_start])); + for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]); + fprintf(stdout, "\n"); + } else { + fprintf(stdout, "[%s] test results...\n%s\n", &(argv[0][exename_start]), sStatus[iStatus]); + } + fflush(stdout); + + if (bQuitInTime) { + __ExitInTime(3); + } +} + +inline void shrQAFinishExit(int argc, const char **argv, int iStatus) +{ + __shrQAFinish(argc, argv, iStatus); + + exit(iStatus ? EXIT_SUCCESS : EXIT_FAILURE); +} + +inline void shrQAFinishExit2(bool bQAtest, int argc, const char **argv, int iStatus) +{ + __shrQAFinish2(bQAtest, argc, argv, iStatus); + + exit(iStatus ? EXIT_SUCCESS : EXIT_FAILURE); +} + +#endif \ No newline at end of file diff --git a/benchmarks/new_opencl/transpose/shrUtils.h b/benchmarks/new_opencl/transpose/shrUtils.h new file mode 100644 index 00000000..45ace670 --- /dev/null +++ b/benchmarks/new_opencl/transpose/shrUtils.h @@ -0,0 +1,642 @@ +/* +* Copyright 1993-2010 NVIDIA Corporation. All rights reserved. +* +* Please refer to the NVIDIA end user license agreement (EULA) associated +* with this source code for terms and conditions that govern your use of +* this software. Any use, reproduction, disclosure, or distribution of +* this software and related documentation outside the terms of the EULA +* is strictly prohibited. +* +*/ + +#ifndef SHR_UTILS_H +#define SHR_UTILS_H + +// ********************************************************************* +// Generic utilities for NVIDIA GPU Computing SDK +// ********************************************************************* + +// reminders for output window and build log +#ifdef _WIN32 + #pragma message ("Note: including windows.h") + #pragma message ("Note: including math.h") + #pragma message ("Note: including assert.h") +#endif + +// OS dependent includes +#ifdef _WIN32 + // Headers needed for Windows + #include +#else + // Headers needed for Linux + #include + #include + #include + #include + #include + #include + #include +#endif + +// Other headers needed for both Windows and Linux +#include +#include +#include +#include +#include + +// Un-comment the following #define to enable profiling code in SDK apps +//#define GPU_PROFILING + +// Beginning of GPU Architecture definitions +inline int ConvertSMVer2Cores(int major, int minor) +{ + // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM + typedef struct { + int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM minor version + int Cores; + } sSMtoCores; + + sSMtoCores nGpuArchCoresPerSM[] = + { { 0x10, 8 }, // Tesla Generation (SM 1.0) G80 class + { 0x11, 8 }, // Tesla Generation (SM 1.1) G8x class + { 0x12, 8 }, // Tesla Generation (SM 1.2) G9x class + { 0x13, 8 }, // Tesla Generation (SM 1.3) GT200 class + { 0x20, 32 }, // Fermi Generation (SM 2.0) GF100 class + { 0x21, 48 }, // Fermi Generation (SM 2.1) GF10x class + { 0x30, 192}, // Fermi Generation (SM 3.0) GK10x class + { -1, -1 } + }; + + int index = 0; + while (nGpuArchCoresPerSM[index].SM != -1) { + if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor) ) { + return nGpuArchCoresPerSM[index].Cores; + } + index++; + } + printf("MapSMtoCores SM %d.%d is undefined (please update to the latest SDK)!\n", major, minor); + return -1; +} +// end of GPU Architecture definitions + + +// Defines and enum for use with logging functions +// ********************************************************************* +#define DEFAULTLOGFILE "SdkConsoleLog.txt" +#define MASTERLOGFILE "SdkMasterLog.csv" +enum LOGMODES +{ + LOGCONSOLE = 1, // bit to signal "log to console" + LOGFILE = 2, // bit to signal "log to file" + LOGBOTH = 3, // convenience union of first 2 bits to signal "log to both" + APPENDMODE = 4, // bit to set "file append" mode instead of "replace mode" on open + MASTER = 8, // bit to signal master .csv log output + ERRORMSG = 16, // bit to signal "pre-pend Error" + CLOSELOG = 32 // bit to close log file, if open, after any requested file write +}; +#define HDASHLINE "-----------------------------------------------------------\n" + +// Standardized boolean +enum shrBOOL +{ + shrFALSE = 0, + shrTRUE = 1 +}; + +// Standardized MAX, MIN and CLAMP +#define MAX(a, b) ((a > b) ? a : b) +#define MIN(a, b) ((a < b) ? a : b) +#define CLAMP(a, b, c) MIN(MAX(a, b), c) // double sided clip of input a +#define TOPCLAMP(a, b) (a < b ? a:b) // single top side clip of input a + +// Error and Exit Handling Macros... +// ********************************************************************* +// Full error handling macro with Cleanup() callback (if supplied)... +// (Companion Inline Function lower on page) +#define shrCheckErrorEX(a, b, c) __shrCheckErrorEX(a, b, c, __FILE__ , __LINE__) + +// Short version without Cleanup() callback pointer +// Both Input (a) and Reference (b) are specified as args +#define shrCheckError(a, b) shrCheckErrorEX(a, b, 0) + +// Standardized Exit Macro for leaving main()... extended version +// (Companion Inline Function lower on page) +#define shrExitEX(a, b, c) __shrExitEX(a, b, c) + +// Standardized Exit Macro for leaving main()... short version +// (Companion Inline Function lower on page) +#define shrEXIT(a, b) __shrExitEX(a, b, EXIT_SUCCESS) + +// Simple argument checker macro +#define ARGCHECK(a) if((a) != shrTRUE)return shrFALSE + +// Define for user-customized error handling +#define STDERROR "file %s, line %i\n\n" , __FILE__ , __LINE__ + +// Function to deallocate memory allocated within shrUtils +// ********************************************************************* +extern "C" void shrFree(void* ptr); + +// ********************************************************************* +// Helper function to log standardized information to Console, to File or to both +//! Examples: shrLogEx(LOGBOTH, 0, "Function A\n"); +//! : shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR); +//! +//! Automatically opens file and stores handle if needed and not done yet +//! Closes file and nulls handle on request +//! +//! @param 0 iLogMode: LOGCONSOLE, LOGFILE, LOGBOTH, APPENDMODE, MASTER, ERRORMSG, CLOSELOG. +//! LOGFILE and LOGBOTH may be | 'd with APPENDMODE to select file append mode instead of overwrite mode +//! LOGFILE and LOGBOTH may be | 'd with CLOSELOG to "write and close" +//! First 3 options may be | 'd with MASTER to enable independent write to master data log file +//! First 3 options may be | 'd with ERRORMSG to start line with standard error message +//! @param 2 dValue: +//! Positive val = double value for time in secs to be formatted to 6 decimals. +//! Negative val is an error code and this give error preformatting. +//! @param 3 cFormatString: String with formatting specifiers like printf or fprintf. +//! ALL printf flags, width, precision and type specifiers are supported with this exception: +//! Wide char type specifiers intended for wprintf (%S and %C) are NOT supported +//! Single byte char type specifiers (%s and %c) ARE supported +//! @param 4... variable args: like printf or fprintf. Must match format specifer type above. +//! @return 0 if OK, negative value on error or if error occurs or was passed in. +// ********************************************************************* +extern "C" int shrLogEx(int iLogMode, int iErrNum, const char* cFormatString, ...); + +// Short version of shrLogEx defaulting to shrLogEx(LOGBOTH, 0, +// ********************************************************************* +extern "C" int shrLog(const char* cFormatString, ...); + +// ********************************************************************* +// Delta timer function for up to 3 independent timers using host high performance counters +// Maintains state for 3 independent counters +//! Example: double dElapsedTime = shrDeltaTime(0); +//! +//! @param 0 iCounterID: Which timer to check/reset. (0, 1, 2) +//! @return delta time of specified counter since last call in seconds. Otherwise -9999.0 if error +// ********************************************************************* +extern "C" double shrDeltaT(int iCounterID); + +// Optional LogFileNameOverride function +// ********************************************************************* +extern "C" void shrSetLogFileName (const char* cOverRideName); + +// Helper function to init data arrays +// ********************************************************************* +extern "C" void shrFillArray(float* pfData, int iSize); + +// Helper function to print data arrays +// ********************************************************************* +extern "C" void shrPrintArray(float* pfData, int iSize); + +//////////////////////////////////////////////////////////////////////////// +//! Find the path for a filename +//! @return the path if succeeded, otherwise 0 +//! @param filename name of the file +//! @param executablePath optional absolute path of the executable +//////////////////////////////////////////////////////////////////////////// +extern "C" char* shrFindFilePath(const char* filename, const char* executablePath); + +//////////////////////////////////////////////////////////////////////////// +//! Read file \filename containing single precision floating point data +//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE +//! @param filename name of the source file +//! @param data uninitialized pointer, returned initialized and pointing to +//! the data read +//! @param len number of data elements in data, -1 on error +//! @note If a NULL pointer is passed to this function and it is initialized +//! within shrUtils, then free() has to be used to deallocate the memory +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrReadFilef( const char* filename, float** data, unsigned int* len, + bool verbose = false); + +//////////////////////////////////////////////////////////////////////////// +//! Read file \filename containing double precision floating point data +//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE +//! @param filename name of the source file +//! @param data uninitialized pointer, returned initialized and pointing to +//! the data read +//! @param len number of data elements in data, -1 on error +//! @note If a NULL pointer is passed to this function and it is +//! @note If a NULL pointer is passed to this function and it is initialized +//! within shrUtils, then free() has to be used to deallocate the memory +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrReadFiled( const char* filename, double** data, unsigned int* len, + bool verbose = false); + +//////////////////////////////////////////////////////////////////////////// +//! Read file \filename containing integer data +//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE +//! @param filename name of the source file +//! @param data uninitialized pointer, returned initialized and pointing to +//! the data read +//! @param len number of data elements in data, -1 on error +//! @note If a NULL pointer is passed to this function and it is +//! @note If a NULL pointer is passed to this function and it is initialized +//! within shrUtils, then free() has to be used to deallocate the memory +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrReadFilei( const char* filename, int** data, unsigned int* len, bool verbose = false); + +//////////////////////////////////////////////////////////////////////////// +//! Read file \filename containing unsigned integer data +//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE +//! @param filename name of the source file +//! @param data uninitialized pointer, returned initialized and pointing to +//! the data read +//! @param len number of data elements in data, -1 on error +//! @note If a NULL pointer is passed to this function and it is +//! @note If a NULL pointer is passed to this function and it is initialized +//! within shrUtils, then free() has to be used to deallocate the memory +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrReadFileui( const char* filename, unsigned int** data, + unsigned int* len, bool verbose = false); + +//////////////////////////////////////////////////////////////////////////// +//! Read file \filename containing char / byte data +//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE +//! @param filename name of the source file +//! @param data uninitialized pointer, returned initialized and pointing to +//! the data read +//! @param len number of data elements in data, -1 on error +//! @note If a NULL pointer is passed to this function and it is +//! @note If a NULL pointer is passed to this function and it is initialized +//! within shrUtils, then free() has to be used to deallocate the memory +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrReadFileb( const char* filename, char** data, unsigned int* len, + bool verbose = false); + +//////////////////////////////////////////////////////////////////////////// +//! Read file \filename containing unsigned char / byte data +//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE +//! @param filename name of the source file +//! @param data uninitialized pointer, returned initialized and pointing to +//! the data read +//! @param len number of data elements in data, -1 on error +//! @note If a NULL pointer is passed to this function and it is +//! @note If a NULL pointer is passed to this function and it is initialized +//! within shrUtils, then free() has to be used to deallocate the memory +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrReadFileub( const char* filename, unsigned char** data, + unsigned int* len, bool verbose = false); + +//////////////////////////////////////////////////////////////////////////// +//! Write a data file \filename containing single precision floating point +//! data +//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE +//! @param filename name of the file to write +//! @param data pointer to data to write +//! @param len number of data elements in data, -1 on error +//! @param epsilon epsilon for comparison +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrWriteFilef( const char* filename, const float* data, unsigned int len, + const float epsilon, bool verbose = false); + +//////////////////////////////////////////////////////////////////////////// +//! Write a data file \filename containing double precision floating point +//! data +//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE +//! @param filename name of the file to write +//! @param data pointer to data to write +//! @param len number of data elements in data, -1 on error +//! @param epsilon epsilon for comparison +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrWriteFiled( const char* filename, const float* data, unsigned int len, + const double epsilon, bool verbose = false); + +//////////////////////////////////////////////////////////////////////////// +//! Write a data file \filename containing integer data +//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE +//! @param filename name of the file to write +//! @param data pointer to data to write +//! @param len number of data elements in data, -1 on error +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrWriteFilei( const char* filename, const int* data, unsigned int len, + bool verbose = false); + +//////////////////////////////////////////////////////////////////////////// +//! Write a data file \filename containing unsigned integer data +//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE +//! @param filename name of the file to write +//! @param data pointer to data to write +//! @param len number of data elements in data, -1 on error +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrWriteFileui( const char* filename, const unsigned int* data, + unsigned int len, bool verbose = false); + +//////////////////////////////////////////////////////////////////////////// +//! Write a data file \filename containing char / byte data +//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE +//! @param filename name of the file to write +//! @param data pointer to data to write +//! @param len number of data elements in data, -1 on error +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrWriteFileb( const char* filename, const char* data, unsigned int len, + bool verbose = false); + +//////////////////////////////////////////////////////////////////////////// +//! Write a data file \filename containing unsigned char / byte data +//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE +//! @param filename name of the file to write +//! @param data pointer to data to write +//! @param len number of data elements in data, -1 on error +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrWriteFileub( const char* filename, const unsigned char* data, + unsigned int len, bool verbose = false); + +//////////////////////////////////////////////////////////////////////////// +//! Load PPM image file (with unsigned char as data element type), padding +//! 4th component +//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE +//! @param file name of the image file +//! @param OutData handle to the data read +//! @param w width of the image +//! @param h height of the image +//! +//! Note: If *OutData is NULL this function allocates buffer that must be freed by caller +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrLoadPPM4ub(const char* file, unsigned char** OutData, + unsigned int *w, unsigned int *h); + +//////////////////////////////////////////////////////////////////////////// +//! Save PPM image file (with unsigned char as data element type, padded to +//! 4 bytes) +//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE +//! @param file name of the image file +//! @param data handle to the data read +//! @param w width of the image +//! @param h height of the image +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrSavePPM4ub( const char* file, unsigned char *data, + unsigned int w, unsigned int h); + +//////////////////////////////////////////////////////////////////////////////// +//! Save PGM image file (with unsigned char as data element type) +//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE +//! @param file name of the image file +//! @param data handle to the data read +//! @param w width of the image +//! @param h height of the image +//////////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrSavePGMub( const char* file, unsigned char *data, + unsigned int w, unsigned int h); + +//////////////////////////////////////////////////////////////////////////// +//! Load PGM image file (with unsigned char as data element type) +//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE +//! @param file name of the image file +//! @param data handle to the data read +//! @param w width of the image +//! @param h height of the image +//! @note If a NULL pointer is passed to this function and it is initialized +//! within shrUtils, then free() has to be used to deallocate the memory +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrLoadPGMub( const char* file, unsigned char** data, + unsigned int *w,unsigned int *h); + +//////////////////////////////////////////////////////////////////////////// +// Command line arguments: General notes +// * All command line arguments begin with '--' followed by the token; +// token and value are seperated by '='; example --samples=50 +// * Arrays have the form --model=[one.obj,two.obj,three.obj] +// (without whitespaces) +//////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////// +//! Check if command line argument \a flag-name is given +//! @return shrTRUE if command line argument \a flag_name has been given, +//! otherwise shrFALSE +//! @param argc argc as passed to main() +//! @param argv argv as passed to main() +//! @param flag_name name of command line flag +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrCheckCmdLineFlag( const int argc, const char** argv, + const char* flag_name); + +//////////////////////////////////////////////////////////////////////////// +//! Get the value of a command line argument of type int +//! @return shrTRUE if command line argument \a arg_name has been given and +//! is of the requested type, otherwise shrFALSE +//! @param argc argc as passed to main() +//! @param argv argv as passed to main() +//! @param arg_name name of the command line argument +//! @param val value of the command line argument +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrGetCmdLineArgumenti( const int argc, const char** argv, + const char* arg_name, int* val); + +//////////////////////////////////////////////////////////////////////////// +//! Get the value of a command line argument of type unsigned int +//! @return shrTRUE if command line argument \a arg_name has been given and +//! is of the requested type, otherwise shrFALSE +//! @param argc argc as passed to main() +//! @param argv argv as passed to main() +//! @param arg_name name of the command line argument +//! @param val value of the command line argument +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrGetCmdLineArgumentu( const int argc, const char** argv, + const char* arg_name, unsigned int* val); + +//////////////////////////////////////////////////////////////////////////// +//! Get the value of a command line argument of type float +//! @return shrTRUE if command line argument \a arg_name has been given and +//! is of the requested type, otherwise shrFALSE +//! @param argc argc as passed to main() +//! @param argv argv as passed to main() +//! @param arg_name name of the command line argument +//! @param val value of the command line argument +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrGetCmdLineArgumentf( const int argc, const char** argv, + const char* arg_name, float* val); + +//////////////////////////////////////////////////////////////////////////// +//! Get the value of a command line argument of type string +//! @return shrTRUE if command line argument \a arg_name has been given and +//! is of the requested type, otherwise shrFALSE +//! @param argc argc as passed to main() +//! @param argv argv as passed to main() +//! @param arg_name name of the command line argument +//! @param val value of the command line argument +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrGetCmdLineArgumentstr( const int argc, const char** argv, + const char* arg_name, char** val); + +//////////////////////////////////////////////////////////////////////////// +//! Get the value of a command line argument list those element are strings +//! @return shrTRUE if command line argument \a arg_name has been given and +//! is of the requested type, otherwise shrFALSE +//! @param argc argc as passed to main() +//! @param argv argv as passed to main() +//! @param arg_name name of the command line argument +//! @param val command line argument list +//! @param len length of the list / number of elements +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrGetCmdLineArgumentListstr( const int argc, const char** argv, + const char* arg_name, char** val, + unsigned int* len); + +//////////////////////////////////////////////////////////////////////////// +//! Compare two float arrays +//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE +//! @param reference handle to the reference data / gold image +//! @param data handle to the computed data +//! @param len number of elements in reference and data +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrComparef( const float* reference, const float* data, + const unsigned int len); + +//////////////////////////////////////////////////////////////////////////// +//! Compare two integer arrays +//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE +//! @param reference handle to the reference data / gold image +//! @param data handle to the computed data +//! @param len number of elements in reference and data +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrComparei( const int* reference, const int* data, + const unsigned int len ); + +//////////////////////////////////////////////////////////////////////////////// +//! Compare two unsigned integer arrays, with epsilon and threshold +//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE +//! @param reference handle to the reference data / gold image +//! @param data handle to the computed data +//! @param len number of elements in reference and data +//! @param threshold tolerance % # of comparison errors (0.15f = 15%) +//////////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrCompareuit( const unsigned int* reference, const unsigned int* data, + const unsigned int len, const float epsilon, const float threshold ); + +//////////////////////////////////////////////////////////////////////////// +//! Compare two unsigned char arrays +//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE +//! @param reference handle to the reference data / gold image +//! @param data handle to the computed data +//! @param len number of elements in reference and data +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrCompareub( const unsigned char* reference, const unsigned char* data, + const unsigned int len ); + +//////////////////////////////////////////////////////////////////////////////// +//! Compare two integers with a tolernance for # of byte errors +//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE +//! @param reference handle to the reference data / gold image +//! @param data handle to the computed data +//! @param len number of elements in reference and data +//! @param epsilon epsilon to use for the comparison +//! @param threshold tolerance % # of comparison errors (0.15f = 15%) +//////////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrCompareubt( const unsigned char* reference, const unsigned char* data, + const unsigned int len, const float epsilon, const float threshold ); + +//////////////////////////////////////////////////////////////////////////////// +//! Compare two integer arrays witha n epsilon tolerance for equality +//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE +//! @param reference handle to the reference data / gold image +//! @param data handle to the computed data +//! @param len number of elements in reference and data +//! @param epsilon epsilon to use for the comparison +//////////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrCompareube( const unsigned char* reference, const unsigned char* data, + const unsigned int len, const float epsilon ); + +//////////////////////////////////////////////////////////////////////////// +//! Compare two float arrays with an epsilon tolerance for equality +//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE +//! @param reference handle to the reference data / gold image +//! @param data handle to the computed data +//! @param len number of elements in reference and data +//! @param epsilon epsilon to use for the comparison +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrComparefe( const float* reference, const float* data, + const unsigned int len, const float epsilon ); + +//////////////////////////////////////////////////////////////////////////////// +//! Compare two float arrays with an epsilon tolerance for equality and a +//! threshold for # pixel errors +//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE +//! @param reference handle to the reference data / gold image +//! @param data handle to the computed data +//! @param len number of elements in reference and data +//! @param epsilon epsilon to use for the comparison +//////////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrComparefet( const float* reference, const float* data, + const unsigned int len, const float epsilon, const float threshold ); + +//////////////////////////////////////////////////////////////////////////// +//! Compare two float arrays using L2-norm with an epsilon tolerance for +//! equality +//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE +//! @param reference handle to the reference data / gold image +//! @param data handle to the computed data +//! @param len number of elements in reference and data +//! @param epsilon epsilon to use for the comparison +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrCompareL2fe( const float* reference, const float* data, + const unsigned int len, const float epsilon ); + +//////////////////////////////////////////////////////////////////////////////// +//! Compare two PPM image files with an epsilon tolerance for equality +//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE +//! @param src_file filename for the image to be compared +//! @param data filename for the reference data / gold image +//! @param epsilon epsilon to use for the comparison +//! @param threshold threshold of pixels that can still mismatch to pass (i.e. 0.15f = 15% must pass) +//! $param verboseErrors output details of image mismatch to std::err +//////////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrComparePPM( const char *src_file, const char *ref_file, const float epsilon, const float threshold); + +//////////////////////////////////////////////////////////////////////////////// +//! Compare two PGM image files with an epsilon tolerance for equality +//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE +//! @param src_file filename for the image to be compared +//! @param data filename for the reference data / gold image +//! @param epsilon epsilon to use for the comparison +//! @param threshold threshold of pixels that can still mismatch to pass (i.e. 0.15f = 15% must pass) +//! $param verboseErrors output details of image mismatch to std::err +//////////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrComparePGM( const char *src_file, const char *ref_file, const float epsilon, const float threshold); + +extern "C" unsigned char* shrLoadRawFile(const char* filename, size_t size); + +extern "C" size_t shrRoundUp(int group_size, int global_size); + +// companion inline function for error checking and exit on error WITH Cleanup Callback (if supplied) +// ********************************************************************* +inline void __shrCheckErrorEX(int iSample, int iReference, void (*pCleanup)(int), const char* cFile, const int iLine) +{ + if (iReference != iSample) + { + shrLogEx(LOGBOTH | ERRORMSG, iSample, "line %i , in file %s !!!\n\n" , iLine, cFile); + if (pCleanup != NULL) + { + pCleanup(EXIT_FAILURE); + } + else + { + shrLogEx(LOGBOTH | CLOSELOG, 0, "Exiting...\n"); + exit(EXIT_FAILURE); + } + } +} + +// Standardized Exit +// ********************************************************************* +inline void __shrExitEX(int argc, const char** argv, int iExitCode) +{ +#ifdef WIN32 + if (!shrCheckCmdLineFlag(argc, argv, "noprompt") && !shrCheckCmdLineFlag(argc, argv, "qatest")) +#else + if (shrCheckCmdLineFlag(argc, argv, "prompt") && !shrCheckCmdLineFlag(argc, argv, "qatest")) +#endif + { + shrLogEx(LOGBOTH | CLOSELOG, 0, "\nPress to Quit...\n"); + getchar(); + } + else + { + shrLogEx(LOGBOTH | CLOSELOG, 0, "%s Exiting...\n", argv[0]); + } + fflush(stderr); + exit(iExitCode); +} + +#endif \ No newline at end of file diff --git a/benchmarks/new_opencl/transpose/transpose.cl b/benchmarks/new_opencl/transpose/transpose.cl new file mode 100644 index 00000000..c0dd6e6b --- /dev/null +++ b/benchmarks/new_opencl/transpose/transpose.cl @@ -0,0 +1,108 @@ +/* + * Copyright 1993-2010 NVIDIA Corporation. All rights reserved. + * + * Please refer to the NVIDIA end user license agreement (EULA) associated + * with this source code for terms and conditions that govern your use of + * this software. Any use, reproduction, disclosure, or distribution of + * this software and related documentation outside the terms of the EULA + * is strictly prohibited. + * + */ + +/* Matrix transpose with OpenCL +* Device code. +*/ + +#define BLOCK_DIM 16 + +// This kernel is optimized to ensure all global reads and writes are coalesced, +// and to avoid bank conflicts in shared memory. This kernel is up to 11x faster +// than the naive kernel below. Note that the shared memory array is sized to +// (BLOCK_DIM+1)*BLOCK_DIM. This pads each row of the 2D block in shared memory +// so that bank conflicts do not occur when threads address the array column-wise. +__kernel void transpose(__global float *odata, __global float *idata, int offset, int width, int height, __local float* block) +{ + // read the matrix tile into shared memory + unsigned int xIndex = get_global_id(0); + unsigned int yIndex = get_global_id(1); + + if((xIndex + offset < width) && (yIndex < height)) + { + unsigned int index_in = yIndex * width + xIndex + offset; + block[get_local_id(1)*(BLOCK_DIM+1)+get_local_id(0)] = idata[index_in]; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + // write the transposed matrix tile to global memory + xIndex = get_group_id(1) * BLOCK_DIM + get_local_id(0); + yIndex = get_group_id(0) * BLOCK_DIM + get_local_id(1); + if((xIndex < height) && (yIndex + offset < width)) + { + unsigned int index_out = yIndex * height + xIndex; + odata[index_out] = block[get_local_id(0)*(BLOCK_DIM+1)+get_local_id(1)]; + } +} + + + +// This naive transpose kernel suffers from completely non-coalesced writes. +// It can be up to 10x slower than the kernel above for large matrices. +__kernel void transpose_naive(__global float *odata, __global float* idata, int offset, int width, int height) +{ + unsigned int xIndex = get_global_id(0); + unsigned int yIndex = get_global_id(1); + + if (xIndex + offset < width && yIndex < height) + { + unsigned int index_in = xIndex + offset + width * yIndex; + unsigned int index_out = yIndex + height * xIndex; + odata[index_out] = idata[index_in]; + } +} + + +__kernel void simple_copy(__global float *odata, __global float* idata, int offset, int width, int height) +{ + unsigned int xIndex = get_global_id(0); + unsigned int yIndex = get_global_id(1); + + if (xIndex + offset < width && yIndex < height) + { + unsigned int index_in = xIndex + offset + width * yIndex; + odata[index_in] = idata[index_in]; + } +} + +__kernel void shared_copy(__global float *odata, __global float *idata, int offset, int width, int height, __local float* block) +{ + // read the matrix tile into shared memory + unsigned int xIndex = get_global_id(0); + unsigned int yIndex = get_global_id(1); + + unsigned int index_in = yIndex * width + xIndex + offset; + if((xIndex + offset< width) && (yIndex < height)) + { + block[get_local_id(1)*(BLOCK_DIM+1)+get_local_id(0)] = idata[index_in]; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if((xIndex < height) && (yIndex+ offset < width)) + { + odata[index_in] = block[get_local_id(1)*(BLOCK_DIM+1)+get_local_id(0)]; + } +} + + +__kernel void uncoalesced_copy(__global float *odata, __global float* idata, int offset, int width, int height) +{ + unsigned int xIndex = get_global_id(0); + unsigned int yIndex = get_global_id(1); + + if (xIndex + offset < width && yIndex < height) + { + unsigned int index_in = yIndex + height * (xIndex+ offset); + odata[index_in] = idata[index_in]; + } +} diff --git a/benchmarks/new_opencl/transpose/transpose_gold.cpp b/benchmarks/new_opencl/transpose/transpose_gold.cpp new file mode 100644 index 00000000..db7fbee5 --- /dev/null +++ b/benchmarks/new_opencl/transpose/transpose_gold.cpp @@ -0,0 +1,38 @@ +/* + * Copyright 1993-2010 NVIDIA Corporation. All rights reserved. + * + * Please refer to the NVIDIA end user license agreement (EULA) associated + * with this source code for terms and conditions that govern your use of + * this software. Any use, reproduction, disclosure, or distribution of + * this software and related documentation outside the terms of the EULA + * is strictly prohibited. + * + */ + +/* Small Matrix transpose with Cuda (Example for a 16x16 matrix) +* Reference solution. +*/ + +//////////////////////////////////////////////////////////////////////////////// +// export C interface +extern "C" +void computeGold( float* reference, float* idata, + const unsigned int size_x, const unsigned int size_y ); + +//////////////////////////////////////////////////////////////////////////////// +//! Compute reference data set +//////////////////////////////////////////////////////////////////////////////// +void +computeGold( float* reference, float* idata, + const unsigned int size_x, const unsigned int size_y ) +{ + // transpose matrix + for( unsigned int y = 0; y < size_y; ++y) + { + for( unsigned int x = 0; x < size_x; ++x) + { + reference[(x * size_y) + y] = idata[(y * size_x) + x]; + } + } +} +