From 00f202bddb06486f7e72325eb8a1b0d626d7d46f Mon Sep 17 00:00:00 2001
From: codetector <codetector@codetector.cn>
Date: Tue, 31 Mar 2020 15:18:11 -0400
Subject: [PATCH] two more benchmarks

---
 benchmarks/new_opencl/convolution/Makefile    |  44 ++
 benchmarks/new_opencl/convolution/input.bmp   | Bin 0 -> 44922 bytes
 benchmarks/new_opencl/convolution/kernel.cl   |  54 ++
 benchmarks/new_opencl/convolution/main.cpp    | 261 +++++++
 benchmarks/new_opencl/convolution/utils.cpp   | 180 +++++
 benchmarks/new_opencl/convolution/utils.h     |  11 +
 benchmarks/new_opencl/transpose/.gitignore    |   1 +
 benchmarks/new_opencl/transpose/Makefile      |  44 ++
 benchmarks/new_opencl/transpose/main.cc       | 387 +++++++++++
 benchmarks/new_opencl/transpose/oclUtils.h    | 198 ++++++
 benchmarks/new_opencl/transpose/shrQATest.h   | 238 +++++++
 benchmarks/new_opencl/transpose/shrUtils.h    | 642 ++++++++++++++++++
 benchmarks/new_opencl/transpose/transpose.cl  | 108 +++
 .../new_opencl/transpose/transpose_gold.cpp   |  38 ++
 14 files changed, 2206 insertions(+)
 create mode 100644 benchmarks/new_opencl/convolution/Makefile
 create mode 100644 benchmarks/new_opencl/convolution/input.bmp
 create mode 100755 benchmarks/new_opencl/convolution/kernel.cl
 create mode 100755 benchmarks/new_opencl/convolution/main.cpp
 create mode 100644 benchmarks/new_opencl/convolution/utils.cpp
 create mode 100644 benchmarks/new_opencl/convolution/utils.h
 create mode 100644 benchmarks/new_opencl/transpose/.gitignore
 create mode 100644 benchmarks/new_opencl/transpose/Makefile
 create mode 100644 benchmarks/new_opencl/transpose/main.cc
 create mode 100644 benchmarks/new_opencl/transpose/oclUtils.h
 create mode 100644 benchmarks/new_opencl/transpose/shrQATest.h
 create mode 100644 benchmarks/new_opencl/transpose/shrUtils.h
 create mode 100644 benchmarks/new_opencl/transpose/transpose.cl
 create mode 100644 benchmarks/new_opencl/transpose/transpose_gold.cpp

diff --git a/benchmarks/new_opencl/convolution/Makefile b/benchmarks/new_opencl/convolution/Makefile
new file mode 100644
index 00000000..512dfd0e
--- /dev/null
+++ b/benchmarks/new_opencl/convolution/Makefile
@@ -0,0 +1,44 @@
+RISCV_TOOL_PATH ?= $(wildcard ~/dev/riscv-gnu-toolchain/drops)
+POCLCC_PATH ?= $(wildcard ~/dev/pocl/drops_vortex_cc)
+POCLRT_PATH ?= $(wildcard ..)
+DRIVER_PATH ?= $(wildcard ../../../driver/sw)
+
+CXXFLAGS += -std=c++11 -O0 -g -fpermissive -Wall -Wextra -pedantic -Wfatal-errors
+
+CXXFLAGS += -I$(POCLRT_PATH)/include
+
+LDFLAGS += -L$(POCLRT_PATH)/lib -L$(DRIVER_PATH)/simx -lOpenCL -lvortex
+
+PROJECT = convolution
+
+SRCS = main.cpp utils.cpp
+
+all: $(PROJECT)
+
+kernel.pocl: kernel.cl
+	POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCLCC_PATH)/lib:$(DRIVER_PATH)/simx $(POCLCC_PATH)/bin/poclcc -o kernel.pocl kernel.cl
+ 
+$(PROJECT): $(SRCS)
+	$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
+
+run-fpga: $(PROJECT) kernel.pocl   
+	LD_LIBRARY_PATH=$(POCLRT_PATH)/lib:$(DRIVER_PATH)/opae:$(LD_LIBRARY_PATH) ./$(PROJECT)
+
+run-ase: $(PROJECT) kernel.pocl   
+	LD_LIBRARY_PATH=$(POCLRT_PATH)/lib:$(DRIVER_PATH)/opae/ase:$(LD_LIBRARY_PATH) ./$(PROJECT)
+
+run-simx: $(PROJECT) kernel.pocl   
+	LD_LIBRARY_PATH=$(POCLRT_PATH)/lib:$(DRIVER_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT)
+
+run-rtlsim: $(PROJECT) kernel.pocl   
+	LD_LIBRARY_PATH=$(POCLRT_PATH)/lib:$(DRIVER_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT)
+
+.depend: $(SRCS)
+	$(CXX) $(CXXFLAGS) -MM $^ > .depend;
+
+clean:
+	rm -rf $(PROJECT) *.o *.dump .depend 
+
+ifneq ($(MAKECMDGOALS),clean)
+    -include .depend
+endif
\ No newline at end of file
diff --git a/benchmarks/new_opencl/convolution/input.bmp b/benchmarks/new_opencl/convolution/input.bmp
new file mode 100644
index 0000000000000000000000000000000000000000..e1b6c6faf5dc7ac7b3d8a76b9a8177726f5a6bdb
GIT binary patch
literal 44922
zcmb5X2Vh&*mFN8=Q4&?DE>Y|i0MQFkiQWj107(!8Nw9ZHqNpy*a!cZpIB{aR#C8(L
z&SZA_?9TSyd+)vX-Pzvz?(FZ}_aG^m>^JjWN+1Z3c=vz$IrrTAQl-bgpsDe=Nz?Y^
z*#S-aIrXhsy)HleV!!@h-jn~SeC3}0=CA(o;E(v#0|yRh2M-?94jnqAH8nM9hYugt
zjvP6n9X)zfYi@4VjvYIu9Y20tJ8|NKcJky&?bNAL+Ue7$wU(9^?aY}o+S#*bwR7jr
zY3I+M*DhSRpk2IpQM+{Ml6Lv>W$nt9D_U!7t9JG3RjsY9O>1v&*E%{nw9d{>t*fg`
z>+bH>dU|@a-rioVudh$*@9)<J1_rdj!9mSrGHGVBS+iIy+R)IDX0=*1o6V-#?RL%K
zaA;1aQ**gon%nKxJRXnc^?Eg*&!-I!4{LtEUkd~RS}+*YLZOfr4u`c!B%+OsjA+ql
zREx!8T09=t5{ZPCOeVEdDy5~<X)Tk<XxVI5%jI%fKA+b{M@O|np`aCuMQv<sOdB5`
z*Gi?5RxX#diHQkqa&l6enwruom5Me!J*`!%Rc&TwMyu6o+U)GCHa9n?&Ckzk^?F@f
zSXj^&7Z<gqr6p~7d0AUoS<zNkSGBdZHEn%;UEA2$&^9+WwXLl!ZF_rL+u7OCc6WER
zYuB!6*RNmKZrr$`?d|PpH*en5_V@R-Teog$w{PFp?%cVf-MxEPyLa!NcK`l;?ZJZw
z+QWwrwHID^L3{M*k@n(?FKUk;Kh|D)=_T#umtWRidF2)D)mLBDUVH5|?F(P{g7(EP
zeo_0<m%gOE{`%|M8*jX!z4_*w+FNhErG5F!U)H|zm9J=Tzx}rM)vtb4``XvOroHpd
zJKER3{&npe-}r|1?z``5Po6x{-h1yo?fv)P*FN~*1MR~PKh(bY&2MTSee{v`@y8!)
zpM3I(_N{MyOZ)b>zpZ`eJKxbh{q$4qyWjn;_St8jX`g@ox%R#9eNX$bANw)w```b*
z_TxYP<JzzJHNQstiJ$lh?I(ZoC$*pYsh`q*`lo+d`<b8l8SQ6(_Gh)9`?;Ufe*Wiw
zUi-Db_Sb5^?$`Y~?H7LG7qnmh>wmrW8-BxY(0=1@{Egag`c1z{`^~@kH*3G;xBM3E
zxBk}Os{OX#_S>}I{@Z`M_B(#Z@6dkd@BE$G@A_T8OZ(lw`*&-<=lA>`?f3rP->dz;
z-}n2p-~ao6zxD_Iz#q{5;2->h+8_Eue@OeofA|k;f8>w+5$%uu(LbvFu|M|5v_JmG
z|G4%i{=}cq{^XzhliHv9Q-4bP(|`I;Yk%g?{2A@f{@FjP{kcE)=d?fn=l{I+7yiOu
z(Ej3I{EOOO`b&RF`^$g%FKd70ulyD5um07)s{OUU_Sdw({@4Gy_BZ~<-_ZW%-~5}}
z-}+mBOZ(e@`)_N1=kNR-?eG5GzpMSdzxVgFzyJ6DzV;9P!9URc;XnL`+CTb7|494C
z|M(wk|Ky+i6YZb=(|@Y{vw!x_w1583|GD;yzxa#VzxWsbLi?Bh@?UEI>R<gU?O*@v
zf35wSfAep&fBSF$t@iK!-M`cR{lEYB+JE>D|3UkY|M5R+|LH&dC+$D~=l`tzm;dr#
zwEy~F|Eu=j{@Z`k{`-Ib@7n+PAOA!9pa1iJYX9qh{V(l*|L^~;{h$Byf3#ovrC-vH
zw_fgQIeYckk1z}ROBe?+H_wfuS5e%qZvOXn_V@Po_jY!6^bGWw1_n)oCbP+c`kF1a
zA**%B<{9FN-8wYnaPy<f>a^G$c8Ar(t3KC&-Rt&xU5+8M%j<F3tV8Iu%jt5sZT2Cj
z+wO2W9WI;6X0_Vg7VD5@$UI~nGMW1Z`}<7;roq0x-k#pRzMg@B?%vkEuB)9L9bG-`
zms;Dq`}%pOxAV%SOP8-)ZSUyn?vc4-a5~!C+uAyMF)f44aR77E+k2wr?3I=a|DX7u
zgML7R7>BQRb%4FQolouS?(OaF>FVt1@9XIquvi8MEklEYCSVS^90NAHt<P$*nOzR6
z+h%c@Y%VXD?G~raVRJhjJbG<jk01L5eOw-o)#3KItab;`JuaWq>vr00Ry#kNAjyyw
zQdrD`{ZQEi{(j6RI6>RnJJ8wQ+tc0K*KzquM^E1Xuk?1d0w4TcJv|Vh8-76{tZzr(
zpqW1(gpB?DeZ9|t`xT$+a&X|#mF|v?PCgQvLTM~`TSu2n!!%^Fnk`myPmj&Tq-_IE
ztJ&%v7;;#lhsohUAZ%c>+pT7s!|if=9G;-t<pi_C?S#!XA5T0U#KtLNfHc@$LIS6K
z$Yyn5{ULyPuy<g<GKc_}dd;0kKyQCfZ%==hAnfUHztY;#(?0-T`#M`MUcS=W*3m6w
z5&|$?AwXBZ$!r$<g9G62`xQa|@?SV~1^)AaJ-uDsT#3uKb@X%(48V7jX}~fxVCoU3
z*$`=`(`GXdS<H6lklkg1v%^le6Xag^@Gu_B?sM2ZKCj<x_j-Mh!-@9-uifFbxm_;1
z(`B`}-7d7i>2O+2X2b~%=<V+8??cb}`>gQ1zh{t{^zaX1=xM#u+SLnwMn~%<*xv?K
zWQIxy(14Ei?g5L!Z&EryMDeQ|I`7r*9=rt0p=eJZpADtEc<ApJsW)3J*1<vZ;NXzi
zWOdq{R;R;aw(_FQWpVqwHj5J|F1sCp_j^4KpTp&Ldws*hKIgF8i*mSKHds6~WOqAV
zZoAXrfCP4@(f|yAeaLRNn9Shq1HYvo7=4O|od^Ny*E`VLdgW?Y-vBT6!+#NfY=S65
zuPy>&1G)zZCx|Lw)c_*!E2Br}f8du696X2ci%n6+uYaHii_<Um$TT=)vh-m9`UWg^
z6B6JsJM4(O6&!9~(C)H{I6G}FzsCh~x8H?DbUNIAzuo3?c|dG;VGSHk7cXKQ>^2i`
zI~`u9&4UVHQ=oy>I?&f|k*W8ibNz$;=DywmIDyjl_qMgR^}q=z0RAhjtyeoR3JhHa
zm~ux4S}L|wWI#mV2eJJCP~(T^?7#dn<T%}qR59d}2e39hkl)k?`^_-BA0Hr;HKV;~
z!4O<@`LJd_yUAg7Ic=ek*=qASU0$!#>+^VA^5}M>@bDY@U>?v3rwxj^5qvwA4|`#;
zS_jR@0sol2-C`aZ!hT~1OauKW!eF;pv%dbmw$}C@?BhV+AolM{>(#c-9$rPwbqNqB
z(8u34B!J)iQHX!wH8AV{^jF0hHn(<m_3)7}eV|{yphW+GdB`+~crp>8>!8_ggZT)7
z&F-?>@NX8I!)bQZwiXtqW9A{Opzy(F#SEhRRx9QaDmYLZo83y3i?PEW@*$W)Y$?)U
zwOQc4jlZ(MKujP;uphej^$&Jgm;p)%{$BJ?oLdJ`H`c$GDZ+o90%8RFpe4byA^~Rb
z2eAIg9~%4@<}>(q=wcW4roSI9169O`OIr|rp7$erNY+5VMZEYB6L)!$W``YBc8pBU
z-q>Bfy|Way+pr2Is|83FyEisooy`qmCgHx*fxmF@*LJIjg5B0{f>aR3ZYHL%yAcPj
z(~s^UXm|s@4-6uJ@E`qe?GUpduXF-miN9C_NI<wCI@>(}&4mC^fIt6L;a9lz2iOk*
zj$IVBLeUhZxd66b=zu6Nv7VkmQA`VBi$((+*~STYOQrQ|TMsriAKu=mV<kln2XSiQ
z@yfORjcY3flZn8_>=+s{neg?NL4?HSa$0!-6|&ha7L&z9JY}&0(PEZsqG?_I_yF^u
zh4^dG)ZKa&s#w5p>gy!qY8Ua>&7l$ivD*XqS)qVP5k~MiZ2ym^;MGMy7XY3U0-d?s
zMr6^~*LL~*rEW1zd??b1cf+}hZIU3+Y#nkt%tJPZ$AY%_<`?$w-F$d`|Ni~OiSAwu
zx!Gp1r>a{Un=d@RaeaT<jpyn&A(*JUZD`15BKG4Wh#E}RAv3zt&rcTOAmzTHKhqaq
zkHPO7>fuk##9<Hv{DTUATYD!TNu(oeQ4#?DLBX#?fGb&kHTZ=9&uC-N;!sobvEwag
z4j(*trq}Hc1Tb+9r#Bo4hcMv8C&YpHeMo{KbD`=3{jRySTX%0hynFYB+Z&^p4;8xw
zDmyp#H(q@C&i>t!PvjG|>W9CBW^lmNL1Iel&>-=$FwBJBn;?!D0S5N37flm`k0k_m
zU&qyM>=1MTe_I>D4(u1vln+G$u-d(7Q-c5^0ndQ_YMzPplMwiMdbIR-0-@lr4|DAs
z8Hq-MKBooKgbl-3xrc+1Xe5C9>bKUmZ|&c@_uz#)JL@S-QrFchtpla`>-)Fw+<)Qj
zy<3H0!qEY-X^1J|DnU(W2l0Am7uL3yKwsqDFkWT?AtCV40O3G?FS0RYb~u3FMtB2$
zv$?N}YC}hNAD^exR45=(K(AyVLIK?h{{O?TU>gr22kProw>KCL`h6aUYj`9Y9SQke
z1VJXuIPsq^7>UJWp<(A>UsrPV*1g+z?(A*u-3<13cU(Go^!UKs^5*`XyAK{cxW8F+
zW2R9XtZBCdDw1M#b`of+*tM%$f;eTxka;n&Fpuvf0=?ZB-@zffr5F4?2%b1%#edQg
zl`AMaB?5rYw%|>~tm*_9YH<BIjy7JB@AC35drLv*Rb39i9|{Kp!#<a1I2w;d!+y8b
zGzdFzeJ)>UBo>cH{NV5K)v9Z|t1FA!uU@zI^|W6&)^xnLG_`#F=B)=WJh-(IA!EBh
z+30fX<@0CGoIOKT>Fk+v7q7H;bRq-Yh(jCl(0Uo84t)r7dwW#;kNpQPS_0;)?LC8t
z1+)P_f7sK9%qR)KQY#bKZ<V|f2{0(2@F_lm3YI<_`g4WVpn(C|_$~dX1KP=MZ#WzZ
z`iI>fKUkw90S~6j!et0_{o!alnHcfA%zf?tiBhF9S()8<Fo!I5fd6D~vM{~0Lw)Dk
zO8D%N!-ozYI&@I}A2@K3FZI~eeB$`AlP6D|I&ti1^U=eHj~qL78p5=+oNGCD^kTPo
z5X>)WmCfAScD21vY^1mc!Qa&@q`@5En}h&FK?B5KjHe<Xc!4kDhd8Z>qQ9e)LIgoQ
zQ@<@gsGq<e3Wp#7_`w<<33}}~H>ZoZ#T|&m65tQGE&c88iPF>*b+Wm|gsBhw&G0{#
zDNoPOFD|a6Ps@k#iO*d@zs^BOa!`mNZyszqeDuVrmX^~ej~#6~bl_O)fXhSVESPMj
zp0?Hw!H=^5KUGFif0O}psPKyo93;l1W@zYu&8h&(h1F2pS2GZ(az*_e^}0OBOX{Ee
zx_YLa=)&wrLIJ<q8;B>9iD=NH@Dsl}J;9M!A_acWP=BktRIXGj)3tgzLKfF?;b_yz
zfn=s!sm<5t=VQ(4V#Z~kUP3MdcpbhxZ#vR!;742z9B(y|Fst~Ps!9(QZ$P;UtGS<0
zzq4ENB)${{;J@G}`2s%#6A_@G+5jq-m*>xhJ|Ttk`PX&2DDa9L$2&ctkw_%y_jm&d
zh2QJIX~7k$7@=q)1^yuTulh?9Q`MQ7`trEPimSTV+|**u<i;wsISQnqV{#$&fkJut
zGKNCmAiz`nCypICeDJ{WHZ%CC9)ZVZ>TYZ68o)bB1Ofg|%pdrbIh2@?NWHJmEXk)3
z0QeHLD9m!X|JP`|)A+9DD6=!Bs5N)ELy?h4FyQe9lc`iZ7WCPN1V2>_Zzz^Xr<16J
zt-m!?o}8GdOxG92-S7wbKYYfV$&OCJ|N4CRh>l+W*yrFk$e@20{3lN;`9IQh@W9D7
z3+M$unU%Sxy{!vBfB_Kv_>f)+B2Wjh0mS5F0{vzu(gFp<29Ta9{wpm1FZ@5^b*5rS
ziV)>!n+yE7J&!k>1Z+Iya}uV4-{ta!<GhrLViNk>!{w>T$@1iMwdi+P`#UcjJ9>UF
zoGwn!&erB;1JC#`#8t!iF7n^d|3illH=j5S{u9T+fAD1ckcX;22EgUCo4VWEyU7;?
zKk_d=Vo>l)s6?o%SG+9D3krw?kXj4>4T}E=oImtZ1GFxs0$qvCk*ki-2yUMeMmWV1
zg2F#!LoQwJ;YcE#$)=(quYI5`I61|N!Nl~;SWx(L@%YgzreLZxQ>)F+R)Q_MM9D|W
zjO0NdBL5BXKXCBS;iJb-o<4o*M6>Y!bcYrA5CAwL|LxsW)`*P+Kk<X$msCegfDix;
zuu^gr0tf{#0fJv2I+!1V+?a+;NFPt9r^qI+9d31mRM_M8Mbeo}Iv(~>%(GF~qL>j$
zX0o|-43QY@h*c`($;pXIb;54}|E1&2t(I_RVs@rhuUAJdA7UQzc>-LYl#W`gzrla(
z-ysy>*ol)k!^7Zj>9SHX^HD+YxP<>565fH|X6aY;Kmz8c$)2(RgEl2E4FWjra>0gh
z$<-NZ8UjzIV!SQS<(;RrGM@12kyb|}O4t+d4v%EAnRFuTqw;4btM_>P(Ns2<OGgoo
z!S-aeQkg;psuNBM#a)+AoNP0NvgMiT%=|(v(R1{me17Ay#+CRc*hK|oT0A*)_{h=b
zW5<pjX~HkIblX_$8s<uFx5D2?t_b_>=KdbZB`BE_=Sj9lsv{wQS@?nkpfX~goelw|
z;}?L>hD^lxVr5XDRpVV=*N6PEhWv-YkB#$zKbuZQhNYP6g)81b4EBScn9$Ujo~cYt
zPvZc`T^5_U>(Z%n-DXc_67{duYpI@Q!LMJ__;djeVl|8_>I67)^a$SX(1C+zd+dUL
zSVY=B*xk|DN2Nf@1Qycbp58&y1s%VnItTz&8Ri8AFapYHIKiaDX@F~dmsb^Rb!B<g
zkO2L=@n?L4{{(rlNKp8n%ViT`KV?h@bxn^i7*FT&xlAk^AO$YW&dy9vlxJqgsAO8Y
zFQ2|RV0EO*)tR~ZrP*}f3B?9A8!l}O#rOsRxEKm>=ui_2v`xUL64~qY5BtH-I*Q%Y
z)!sEgshCos4gQlYfS+8qk?J4-gBB?<i^Whz0282qaWRFefxYovy)F;>P)`iHsQJl+
zw5BVz5IGJ>-tb5^m(9X|H<^zYu^bL1vMi!zc++P!m*;0@rmN+t*`gFRx?2UmJ5jF9
z*5?*#IqMl6zf44Cqb|UMP=FybR6k(>z%Rlt`1@Qe^!f!q{0Beb9r{l#VxWf#mDEx2
z@gf350TKoeIftQ_Apv3oK_(yj6k`MWbKhk)&yA;3Qxnn-9KLLeL=%GF7X@uDH9|e#
z<{b9>1%GaIRPcxVc6VuhzEZ8#X6D9S<g2|`&s`a?`qQQA+$<7M@LX&XfP5&|GSCWo
z86*S-1z>;E;Uh@DBEX^Z{Zf12N?xDKK1jer)s+ei1vARQ1pmNSnU08nB7l`*Gkj45
zP;OIQOkManh(G-FdFtrwQAE&L)pW@kj>V%9KlpQkKcetc)*r_G<wgsmnK<F0BUGx)
z%~q;VplCA<k*r+mG+QHM<0!;@eQCnqc0?Hfb#3+WN*(}i#Q*Z6f`61e=*VG`r2~g9
z4tT&H5RSTS0|Y!~Re?eU2z99sTUl?CWLGzVNPtc7Geiqi9H7{5Q2j^27P!yYFMLsK
z*IA|bbm-DhIF^hd^}dm8eiUmLpy2EBaUtAaz99HVB4KwhUo2Orr)FlWX=-~?tLd}Z
z62t?u;9s1I_BP9h8=tO!6A^C=xuUqg!&v|0$H)XR08QWz1cC<s{!WU!PHN{Ao-Nc;
zP=A#f8mTT8&}_1Mkrp9=I885J6U=-r4?hfEz0v?Hza_sQ5Aw_My76}h4qhB0%8i5H
z8x?6!M}w4mUBj>?Fp?TA6bj%Ui3}%86Jyg;71F)DhXPG+M<=DI)Z|2UetuzoaXQki
z9DqI<!OBoN{!{>XL8Slau@fgw9wYoYbl~tMlULzK0Nl1d@Uw6%HFM-&>a&y{h5wT7
zOIAQ#z`^ee1(XPg1Q>V}>2-_>q5`N8LUSCT0<NQ$R~0b~N*ugs3CB~(Xu$1><_ZPy
z2LwNIIjry(1%EUeE>^L9)6=sH^W`Ag*xS)F<ZvZ&slv<x2~Z_!Y7q$+!m1%xd7k@9
z8^rq){hT^Q`b+wE<ci5hgG*5GDE$2_C&+@S1N@W`5r4_^bPW&}I7C4T0zd(g0GxnZ
zflwS(l#om7w2;?yChMk0y`x@IZ>V1qQP2)v7>bbO#Y1lIh|>00$m6to{D@^Rni-{O
zCKHdwVyP1DuQD?`H$OQUbJ#4s?cEl~@OU9RQD0cB&(*TF)5ID2XUk<|aAo1X6h#>F
zg0TN2@h|v~2!6|OC>SF52EVnpvuBV6H2BZ5BPCC2kSv}FeqIs*pelh_LIA%IK#v0i
zmOc!+H-40<$ZXV%geLM--jt{6pL(htICNoXguE{ia{Ho6+hak&A7EOcXr?$eUIc$4
zPN2K6MEXBFJ3W)KTM1UXP1evf?w~$9Lj@#jKi8x_SHFUQmH#qu`8xaqBL2ruQvW!4
zycq*{wAC^Ud?@Xw=+@WSW3qd|Z>Oh%qBrU<ONj;n)S}Oj%P;bx1OP99+ce<*GVtnk
zd9?vs|3!Y%X(4lymlXd;Nb=$#mp2Cf0{Tzp>j?xyfp9Ea93LOc#zzvV+|*2EVR^nX
zHB*~UTP@)48nBF1=j(Ixv-OpYm7MqLA)UbLk<VA)<w4-f&*=YQ@Si$;riE%4`A_rJ
zAwSJs$UoIL@b{V=WZkSCI;<2@=`x`RC25|L00>~Qx?xL$0D|A6An4eB#N&@3iz0wX
z149w#;CahPGLuRKoSrDsKAIT`u@>MFzC_~LVrhIVlZYkL`I)&X-2e0}!A{an0jRyR
z&z7H=pPwWDUzncE2RjezpUTG?pD$b&?ko8hdBFFR{+>B|1_c2Bu{Nup#x|<2RG)@=
zyLv57FS5sSo0V7}@t0+0J=Lce3ju7Ds!<rR0YWo;fPudOwDDbNq2|%xzdkj2QGZKj
zsLvDp5#hhz>5k=#V}(pq@OuLkg~RdOnBspdl^?4u*QTcD>dUJOa|P#+x#w!vpgC7t
zsMi<igIlOi1bdDhRKVqu1_49~AcA25pv4i&pDkz4oo&(aw_Ae>KjmllP!H~pP9pg2
zM*fNQX{RB}k+o$6K!w3}N|jQpL<Rf-@c|9_H*hJ`@?6I*U6DFY<25xgc~ebPQ9wIz
z&P<k@P6QmT_~=-16#O(bc>-J~lE{yjO4z?xx>%ZDSzMT&nxz47anw09(Aj3P+X|BV
zuC6RCEw4_8x{n$kXnZ(V*3k<Ao|?bor_Y|haPG`0;s5atTYx1ega6&#eO8xGIt<)S
zS&KE>SVNaYpFB?*iHHFmKJZJmT1mk3_;s!dYz0qecf<5>aRpg%<r#S4693Mc!r)KR
z-w-d1jTN%-5G%gOe>gM({&ETZkCPZo)aI8K=VzCf7bnwh8>`k%uQxk2Ge0}OxU{mq
zR*m$*e|4SbW&rn}6Mw2-M`_r)aPfT0X{rAl@304H>I{bh!C|+B9vrfJ<R2f%3Iyhl
z)*@UyHUJ}_(g3GV;D^xwkpK|@CC>T-&>8aB-~;i7p?g9YeR7I2jTx%%X9RyH>9;ur
ze=!#i3jc#JW+YW88Tj+5;zVU`er{%JYJ5B&rb1xz28NTPganh5mD<8WHQG-G$VHTl
zsjG<e>G%x?psJrO=Pq2laQ5_xqlXWk=yXte20tNy+tS@jN8K=eWiAi+O=en-l>93R
z5FbdhfuG+O4G<krPEcX~fd`p^OoD;IH4LwiMa{_|f<X@T+L=M>ewpO3)gH&%7lr?H
zbqM}wx=@-ZBmeR2==fx1dTy>(p#)l<$go&MxtzR#`bc4Ha%OgRGTwJYK3q6pTuHt`
z13dwdi5$fIooqRG@#2MZr;i)>DLqFpX{@lYwoTSe+Zd@mi_oY)wLU@|`mI!{3khI7
zLhu_VK;mXWsjwOkY6j{>H5nnqb5qiPo0+Id%AfEI{Hbgv>9g3tUmD9%0CYKhA+8il
z6-yHn<C#Pvo0*)SUs_q1tIbr36LVt$T#wHm%i|iNqviSK)ur)~t|L1C1;0Upr}{6V
zAo!0SJ9YN_#Y-2@BL9aEp6YT^c_q0BDg3xU?=Vmme%6Ua{Ea-HtWX4i^$31lCxIRX
z2!>x_2v{{GnUwrDWI{dXy_SJ+Dw9ok&GvYqR6_p4G%o7+(?x3D=zlU_tkChiGFP1}
zWQwy(Q~odwVBX~PWGNRP$xkh;uT4dIjvB;Pm*YA*_ZtQP3P|*O`W%ad7tXeb{GaZ2
zhiT>(iT1lp^b}CtkY#u;-8QtKI;93D0-$REL7>790dxrv1pqsjQ~!Sy8DtU-?EI3N
zpuv6Po2Kw*vk8ycmMAFxhkb6>u++q38HGQSNRE;AOwUXebE#ZuZh3Vknu!d%d<ndN
zbuyVMR+qOH(*4cwU+Ak3o#RUXbt5GBj~++;FI~QL{>;f{@b@VEBjFG=r@`)Ci%XW_
z8~BNZX)9FaZlwU^Mcz;p{v$BT2uc*7Knn$)4FyyE$TMc4=A?*VkfJe1<NN7>P>Qk-
z_(lGO|I+Uz_~V)41pLn=GNtLc+IS{gn5fj3SJpS?^QC0aJCd%itSl~7C#NSX3k&&y
z<0=6VX6SMz{8#B$!w4Ue;=ka(c>c_ZqfNA4c*v}xFq9Sc0oJzN;CDKyJd^Fwa}57g
zsf#>U)_cGomKAzYfQAKp2K$GeDJl?ftJ;eYK~3t}<P2$Grl<NsN$`6Gzb^m7ZowaU
zia(hxP1hG|lbHGCwbk{l?YYYASUi#~FR!n!EG^gPXKJ&xG5aZ%Xa0~3=+;l{Al8`Z
zPw^iIc(S?a(Ahp6KjJ)WAL!{H@~{d|8-~cg;HTE9Dt&5w1OhmRMO~B$;7=q7(E0yE
z&<&&l`H)oEA-@uK#cz2qh@tqTX{Y*w@INK`Uo4f!a*6P;$AxhpiN>?z6H}988KS?5
z>g>Ym>Qa5azP!G@vADc8lN&A453sSaT(2)JuCLT4yyp)I(dF|SG*H%GIbn*ug8$gb
zGv_Z}M*dq)fd5>-m&{Ta8ur=gxw3lwxL)@#{3q6Ry1jamt6~7jgIxYd3<B^6VgQs2
z{AJM93xZG>E&;$nqCj0Ep8;1t8^5BR><uQfxs*@gmuLt4<UcG5B@O)PY>qf+c70)X
zu2x%G-CEn)*q$xdDzmHGTPurmGc)r`>$S<yWfA~^uIQ{^QV4(q82O*@|LlcJtyk!f
zK>iP%AMiz`^p5=dY<)fbHih5grx?Nt96hHh$J2cv_$f+6AeWc`r2!HG2naQF$of+u
zzxpR@Bf=79B(qXa<v}WS>LvND6TN|SE|(g%!2dG*mr@W-t#p21?4~M{#cU>*FW~n{
z{wlNerPYm%t=*l~1;XByjjhd<#rn+j+(Kn6bVVgVOhK+>3<0kPVbI_Z$^Y4l^tB59
z=BB0#;E#bw$4_gM&rj$H*WGknVg7XdxLUPBEbRy*f?p9pDL^owgV(_}o<IMS!mLAW
zkc7GE_~kwR9q;i2FzvV4V}){g9Q+hOaR4Lnbe@if$zqNucye;8JYB8Um)2GmmN&Mp
z?QX5DudS@DEH5t+71!sMrt_hzI{y`^8}Ri2NKXI{G$H?IFI~CXdietQ4>w&h4aX?G
zBLE?<js9bYkIrJJk7SrdQ0xE!t}1h=N+<H~4k&Vo0x*;UJj2|0CSV!*qhf_HLw`Z|
zB0uq<n*<@ju^xXqH!AoI{a5(u{E24>Z{dF~Uz%B1n6IJrz^#?$*SB`}Hr6*c*Vhn!
zc)v*gGdmjXJtlNj7uT=CHB|H`tQSRSBKbdmxs@g$@E>hzx?~Q<8Q=+dt-XC_hcBSx
zchM~C@UR3SDUPc2sw~JIq(Mh1fT#d@0Gq>%VZb$j>$8v_bm#(KenTPGUlg056Q#L(
zm{p$iu*DIF|6{pigasf1;HcoQRET&+%QZs0#g)ayh1yhMbY^W6_U~-6-)nw)s;csz
zjm2`-+H%Oaq7GkpuZzF#garTT^Ovu-w_Uk#<|OzpTLeG+2mcWGUBiCD|6%lB;fMdK
z%q7c%vav?m5#s0ze;^VdQSgtDf4R2ezC2+D;N($3Rb)^!c<w=K?(%^@HEb67muQc6
zCmWe2<{y7A{4bVg7MGV6SFwI&76_^v8*3|D*LJqIHy0;Iixo1!<+YV+*>m1l0#Ov^
zvXXx^C?GL_xc>_@gtfOS{D)hI6#ru*lpHL*WdGoIu<`<a8_}PX;6?s@UOfUN`-A^V
z0wCFlvH%AA^=}IJa}UfzTpt$5V79!+e;y1*$=@CA@M7STUX#*)9lzD#3yvfd{)y3i
zX>w+9slKwlSRK!&<Jl=Z;pVmLyVv%1R!Zq$qFkM;FD_Tg;dUZmxq@6o4G{oiNGQ}q
z_Ir`$^Uk(QXOaJ-t=3RHP6jbTgN+&dEN-xKg31Q??Jh4%P>njTGzbtFh2x317!0w2
zdJt^j2agQF_5Af!{3jupnwS2Lj$Ee4GkNJqyPGU7?j3N%its-t?QY;F`_C07E0u}S
z-1tP57-xRDR?cP9scd;}X=(e$t=+wAyBnpre>gr$ZD4V7JZqB@xO}MkV3E59df~pL
zKPOu*wRLuNwO>AW3j9}X;RJvX=b+bw`}YKf34a7XMSnN!ml~H514tQQctqGQDxgfD
z1OW>6Q|N-;xVnOA5JazZD&i;_=sKZj#jiD8b)~ZTxJU3$O8hh8XXlI9|6Fl$dU_(C
zDU_$G<=jNAM13%q7#p9QsqgIFy?uLc@A_n#h?FwP^ukP~98_z7`XqGzi|jSA)~kmA
z^uC?B)YjG0-G2E3@n3VhJp%p&+zh(e3qwUgc2KeKpzy<fsd4CaUP*%7{*eT~4}Rc_
z{6m1J_@95GBNYlj4E0aVMp1*8;DoXXN(NAoR#!5YPk2pE!9Nau8UP42k$=HInNN)p
z7N%pla&c^od|-UMT&Q2WbMMaG{TtWnrSW3EFkUE3PM3?}Ruqk^C?4?5uz!NTVgFk$
zU1j%8`<3&jnhziAZ~{oNGw2?qwa*)58I%<<;xCF|!)j^N@PNX8w?C@zD*_NSND`!o
z-FPr)AdD5*iVQLr`AMJHvv0_w@$#X|PVmQv&ETKV<=;uAB@~J0$G~6C69!QQD2*0N
zQ?u2Ha(OB?F~7Ti`{4@@@7~;9nk<f0XJ^Wj<y<k+P8Xme0N-RN0_aM>Qz`%8|Jdqy
z^~yQopW~f^UkCvI2YUzYzJOHTg#Q#lJ>XZBaakRgDu@gBPcx>N00|l)04_jbXgu&;
zo{H8p^obbk7Xs+?khc{*)GNGp=&~c3%f)?WXS@h^vm*bjv#RRn6bt^jR2DZt^h5K{
zRJAfw%@%5FH|{-p`HNqE{iXZcGleXLp}Fe#Xeq8rph9=!v!AyB%!vH=#2EsBt5+_b
z1^<aImtrXR1Fpe7lVceEV|a!CY&!uy362T@bo`#6G^2_JC<zb&(DCa+uCU9cWtEK;
z_NPIf@Ij#$3OqMC<F!MV9LYTP&*XZ>f21%h{I67}%Q?cpiOES;ekx=-a}(5iR`%~b
zdhM;Z-~Q^CUwbe&RW41<EzeiW#Ymej|2llsOfVZmJ(uoZ5+JGnQU5&E<&J^}IMJYM
zpwEoWf%B{gsv@9Imbx&2esO?!K(`D0Ng6bj1yCly;J<KOabEFSimc7Yk2kZvX7GVI
zH1I!VOoJYLKX}Ow|D)~!XN)MH_$TUT{fokUJYN$0`Aoi)&(Y^lonKm*uT-k@tG8cx
z?Mq*N>&>rz<?Sy%+$BVwtSzq0Pv!lsY6(oPYmh(>f5ZYR{7r|OkDqFx{C`U3cdFYB
zLzCjr1I_{5zduCgDfo#3G5-O*#;F_t_}!im{ke(&Dh84uIHa`LfDaL*zwGF-<Mdlg
zeL+An8HL|42<ioSU_OmmUa%(e*@$xh{!6th8Djxh^8Z-AG(A%-=hHcek{hqiEicvQ
zNcESl-F^AZFTeGbufO~Cuf6`ohpYAFxw-kZrRhSbQ%azsTn2mv8Uc8UpUKhwEA4+$
z{_6E8hC%?pqrZR19iXbljth~0@C$qdK!t!x0lWr&u>k0dj$c@-_%4@J@aZo&eyqWN
zH4(!W2;K%}aRNF?4jec?6whTMjy^}Mfd1!_u^?I)3ODeh|Kq~{TD86~H#;{wyLj`Z
zH^2JTZ#?1LfcM{h=ZzQk5dQk|()3sm{5p6=gogY#?7y_W96f&G<Vk7&qx|2}=M{z~
zVptq|Kkh#mW*-hKoPOe6k588R6n<Qw%7A>~M9NSAF#v);OnpaJFx{qMV`;Icze@T`
z1iN5Vs1;Xq6QRrjtPwF_96W1@<uYMMza!ef@3#9Ww8!%0=~}gvOJ`{UnV4T#sIM&4
zDwX+-2d{tmYfnCS|GjtLfAZef-~Qq&`>U%n_1Vd>P**c05UMV6G4;`g{_6@LvZ+>o
zWiJ5pJJat=BvWG1qkdc8fK~Qku$!3mP<GO~huM|$+)_XCPdg6yF@SmuAUP27PmPBm
z)=rULL{5f+-e9+SM@>p5Ch?#|<}y_oXH3zo;J1T+ay*xc3;)9-=s(S_)1_=WJ6ae=
z6J}Wcs8r{+Zoc%Tx4-t}{cn8by^lV8_sQ43@}<Z7OS7}}>FKof%wd^@VFiT$4f7`k
z0Q}7?ezW=E6xDz55BO<35I`ftwtjK{5%wX$_F?dg90-0r1QPq_9+q`_MF8ahqz)E_
zqlb>tQAG2RfKl+clBm3pK;hFL)l>a}cO(Qd#=#cTNG=_A3^<~rQ{blq+T|GL!=w3$
z>P&S!myyEf%-l3#{#<=&b8GAVo9_Vqo%cTZ=z|a6d-C<Szi@xEJ~Lmh7KU484e&_g
zf(k#pHxvN(hxtEu{yfW{n7>2k6#kULZ>6&znG1oR%}9j%;MXgB$^)on0KbkuiT*43
z7ybjW>1cCvGuG}P7ZT~GyApylAEv)vO$4hbyw(LmFbid#8wXDfMDpMtw2#35G2y>k
z@P}iglPq|H9}6%wLk_fDpQGZp`r!4qpM3MfC*OGgqfb8i=9Bk6c>9YFwiagD8Zs7Y
zKhkudsmU-hTvl;H5kQ`U|1_(Am(HJM`Sb9h^CrK+&=H@tANP+xA08I`u-i)mf~xQ-
z1E~0~@GAij{1^cE&!-*%zBJSdYL2?$l$(>MXt6ndRI)x9x&shtkl5Floc@-2JlPk<
z{yD)v3jVybf;jz{7x+J;<4>eX0qe^P)cIDHZoKr?yYIjEwf8>x<l7&=|H1q3K6&@;
zS8gq01E<TeP7DAC{um#w2++^~M3VBq!Y}S01vqaCq;&a@_=Y6>f&Vx?7DIsRQ}Qn!
zP(?tp3KEDW)1m-+45)ITkgywOAH(J!$L)dpbj#_MmNV#BGa_NI`cNY-XKKdeo~g%y
zV?9CeyUg|o_{ne+VUH^qi-n{4NgY42e}R2I3v-KGtE<-@zVhbRzVYP!?|k>O&p!U-
z;}70{@{O;5>A`AkVRm*R(R=zpGukf{;Gd3P<v&XQ_2rMtmlXb{Ll@1m;X({Rg#9IE
zCySdw1HVT%|6&0ZezE_aK#bp3zb`QpRRj1h@D%~X=PBO;3Dh`#M414Ew;}L4dZEfw
z(jXDdU4g8^KQhYdu8u!MM=bIW{;^ypMPp!jc9Et2jg_@qkG}ZU*FX5=Tc3ab$A0X4
zpMUhp$KQPKo!4L5rW0spDrG)-@T3q}2q5?k=IaIk^LJb=f9U*gx@ZZe(`msU_LvEO
z0^~Cd{9eqzq5qNt$|{gI7)zy7Y2^V`(g6Mty`&017C;SjK_B8^bum;^F*qXM6m-!{
z^)`=3JA9dRgr@FDerl=&{)pETX4hY=Fjbos{K;^pFg?4py1BZ#x^eUV7ry%8C*S>)
zQ-41H^z+ZZ_ucP)^1<6*xV5yjFgu-dwzM>~p9`t=_h<4i_*>4i{CEDGIDn?hLm@U?
zAkgu!%QP_LmfjXWeJ9Aj$D`v{5s-o37m6eP@Lybj7(l_#rxR2_1L>%448z_kCcyIN
z2%scWpo<JB(SQg?u6k1Gh}&Wd!+-E6BR(&Qnc$zDtBlbTAI*%h@n>y$Wo={Y`ipP9
z_pR@K{=Fan{&zq9?EByU?DOw_=UeZ6<<YgprG=Ss$Hfa=O<hH(t6P43T9TEp{KM+6
zlK&%3O;?6O@E?Z8L(ahgtB2l>AXZQCdkF({{ww(>{NpQ}P~xJ)7Xr|hEBus(V(hNq
zlq)N|I&+^ve+rga!FxRhP|qpLU2!JUBi<ovC^uCp<ul2VVb4f15s6dU6a49PELE7Q
z)}-LSzPbDOosYls+4ng7>C;a?`~0&{zYG44-+T2&y*^tTx9T-;K3E1z+!Fih_WzKi
zzbDxdYT!S7xYZig0mXyhw@Ggg_;CWPu6uER$UpW^6@djm;UD}L3n&o~Ez81x03Q^e
z$J)CLk!@Y+kO5wj9~IzdZyUeFTl%}q@RB2ujrpvWV0Nlf9?hg8!@g)bF{1Dz|L{MS
zpQ*2`t}d@{?A(3nosU2L{IkzK{p4HU{`jMhKmO#~-}&UjCtu#H*Xs*qN4u<n>EqzB
zCYg!|qx={52Y!~nE~)m1ro*kaNG2_K;sM9Npbhs2e&TM~jO-KsD+8bcAQZymlg&u-
z2O<Cok^g8o#H9$UB>)sDYm&haq7kaaoCyeic~oEegCbG`KkuHm#j|n0%@W8~rpu$*
z^vLjVG$Z&W=N->y($Q#UVqs}vWov6|XaALNeEjWCzx~NaA97yUhaY_O;fL=(dH=(2
zytFjCJU<b>sxkm*pfw$9utLW#ih7LnU-2LPZ#r_-K9WvnAVAD-A2ive6)uSQ3w|$Q
zpsa9;|C0nr?7wWr(&YvC2!Is9f<P0<iUgttVhjlb_>u@y=r5e-QIIR1^P}Kz*gv7p
zITD~`z&bdbof7=)02q#ClY)N^`R9drJYA{J%`R?iZ0<jJ<NXgl`sTO4^}z@4zWtTA
zzWg@+|Lb3W^6p!^vrDz<<XI!=(2g~WjtNs3Fb7#+`A2ns5dYx+n~t<OqY6+eHf-bI
z2tVmO@R5J)ALg&I#3}wC`Dc3uZCX%E=fB`*v%Z3_(3AEU1kgQ!Y)6(UC}9L|Lo5Zb
zf-jH!29!8s9ufQlUW5Np|1gcs(fC*u{KZigN7C8J+RS2od3$63@mHQadH<WAeEj4a
zZ@%`@%P+n1#@p|F?VWeO@#XvTi}UP)He&yiN7N_ly02tY!XNMx{#^t=?QcgrTv3ke
z6aFjw0a?)kH9?TqD}Axb0Eqi%5eNc+Ur9>?zhr=7{|%|vi6E+gGVo9Bun?x}z?JJ)
zPj&G8ostP5LdzidgVw$V{t<s5NgsGj;V<M0*;FQnE37XpZSUNA`72M}fAZnGUw!kH
z#~jl4=+R5Bz5ez)?>>3s;qF|$9>1X0Ko6ecn|>V~JC7>-1^*>hza{<^{4u4^si@D^
zZ*c}9h&$|8_FuU_(SO8Whfn&G%A_+GKotS%4lpPpuHt@ys}HUuPXxHa`;77WGZB6H
zGoAVJ?DRl5n+n?cJQ?J_kjq2^!4!Rv@PB@Osz6gXy8uhGbMSv@|N3k1y!+vkx4-bx
zqlfqJ-n;wY;lmeSdgaY`-uc=q`_-jdwpE${51lroPPo$;s`s1lSMZ-XdF;rc<}P=f
z)C!S~daZ*)ZaIZO)StcBKB8X{|3>^L<qwY^{JOn-8UT?0DDB1sirxueae4aH73bwC
zFY%%RFJw`~kq4a&5aLvSD4P!3g#T6aKN|~#vc*hH@XrbUJSi~yJfXnC&fY6u`|$m@
zUVh=hy}NhzZ{50cmlH7_z53<1-uTkB<@s71`9E;@oXkoW0Fh3E0#Cy~<RAP^$GSY=
z7ephzA>6-Me?mY>0u=s+{X_m)|5lq3#axKPFajWUEsMjK&u^mUAkY~kISQPbfkggl
z2qBcV%bRL$iU4YEbU+XQ3;wC;3E_V*k}qasi7_&M!9SYGWyfacX#QVZyZ_dE?|${w
zNB8gEx_R@)wQDzS-nx6|{!3qY<IOkrRwifS7n%+nJ1ZB|NJmBJ#rEk#;fMd|zvBOi
z9#1^WhRakc;^ibAFMi)I?XZG>82+pEO#(v7KjQwpeypW37Z5;|K*atDTlsjU?FzL#
z(Dfoz=NaFXn2WG0vIu%(C_PbAJl-A1Wh1tM;T-%2e=HQqvksUTpQ#)C&*n1(__LL{
z`rcRHd;9grcW>?O$@=H!=FW|4H*dZ8_~kFX^2&0#>OXm~<s5U;4vP6@CeID<6aMM?
zFY<q~*O!3*w4O#hmI1Q=AQgXA)K~a*`={4G$Ok=={^;VO>%R&BRs1K{5Gn%}Pbh{)
zOrRJ9o~bNAAvOd6I`EE6O*j5LKi1{X=Oea3e-8c^+3yyPjMCT#|EX$?<_lE7G80k@
zARpR$`|U5j`uOIxoz30#jkTTijn&QV8xQZj_|mJ7HmfC9^W`(b{pO>_HDqF9pJaLx
z{!92L-Ct~fIN9e*WR(Dr{tVdtY%=ue_yg3wb?Yw$P?3KfzaIW5_Uj2iMA<(+Tfd0J
zK9cn6*HgmBE+ZL0Ro~Tf`E|t_gFy$IJBD+)h|Ltti~NH>6fKr=3E}_jL}9czno4Dh
zY<-<5SEg^i{`%t=U)-VZY2(_B-CH-e=mFWgv-je?$G2CeOU|BC>?mzHM|nfIA@h<4
zH5C8Bf0_1gy8jNH>Kmr-43?%sF4LeRzz$>|UAII53crZIVg83D`=|O3eo+@A0FK4b
z|K~(ZF38Xc4;4h<g~`tbFUojsPWqo8Y4^c@$54nUzg);?<B|ASIhRO|*WiC)bgYn0
zWJkwls_cHBT)h3-OSf<BFE6ZJyLNMb|Hh4-)#Y2)?%#ZP`)0i|8EiRv>cZKhhOL!P
ze}*6TFU`M~E??B~xAgnzJA(j8(m#_kNb#HFcBH6J$&2tu**_KiO8Do|@r(V#{80i?
z@*h%obowf6z|}7<sKpXKL(ena0Lrh+)Pxjj9BK3B^I^L+Qm7#RvfnXL0zdp;5dM#Y
zKbg%>%~dLsV^drAU%tD2e_?U1wy?ReG*zi@@7~4_?(N@Lnji1Gbn%R8f{=?fK3>0w
z+WG|k%a_1U_v@jS0Y81Wa5PE!XLbi!{GjVvihATcBL7d_AK@PbF*YOf%Lst%R#N!c
zmn!hkXD%VQ8~+SfAX2XrK#X7m{?lnao#5eCPi{2qbVQ3&RVDu;Z1&5=Q^Nm=Q8q=;
z0h`Tp_Qk?{ZgOLH@8R~^c71uVTB^*h@7=k7@4;=3GQPRLGCuBZZC2!$nfx#Tg#Rl1
zzex8l@_%L^kQDJxCrJM+?oi0*_Da_^MG#hgi3SY$XZ=guzb6<M^l4E5iT@4zhtpGl
z^lG+#aa|JRQM5tf7D^aHu|sj*VE>_}%kFGG>~O`#4E)j5L^+TBXJh+Bp)gJdcq*SQ
zO%^LNGc_z8%?~$k?Cx&NOt0?jKe~Va=Jxfyd-w0IO^@{-Yj9t|mznARxTua_bw8gz
zPWR)P!65uc0pdZs$>I$MIZjvM7x^a<YUsb-03-Z|SWhzm@Z<l~`aigY|4I}TG2}-v
z1=t|jW~vzR6utbmd_A>G{O%=pE+2Aw5`w=-i%TpsS<dVBuUw!hJQGb9^QEz|ayGee
z`})fK#>Vcg{YMY4uf6vAs}FA8zI*rPz5BQJXVTd&xti#_h@D<$kjbbaTfc<=YV{NO
zKW7SZxCRmc{z0pcGY05yrn#B?k1Cj;m-@ei|0?{`<zMhi#e`Ku)i)-Bro>-YX=QO0
zc1Z`wha?G7oEL3Z(B(lV&(oP5y5bT1zT`9fQxk%}!T)@8Bs)qgd^wY=PtVl1w$@g6
zAHVTtZR+tGFYauuK#k40$%$Gv-+STg*)ylu->dew$_4muT<37JO8+GMm-O$@Iddot
z`?Hx?0Q|!oJ0YtsBLBoeO8Z#}Q@flD{a5L~5I_|GL;%o#9hFXDc@(w_-_@YV)W9zc
zk)hx#kNivknp%eo`Ji(+t;;{@Zf*+s7yDl;7mD=6#>mw2bib4q>az=r>({Sczw?FH
z?^ZMQomXyd?Owlq_x9G>;!Le-zjR6*FlzvqJGIIAnVd=WQ>A|wNPob8f%2!a|1m$u
zMEiqYr(ITE;XnAF<G<?v!T^NC|A8L=2Lb4t68s9jJSZv)E(TPP%gVFo9Kcfxpx#!$
zCV#{)9%&DbgFlciP9Xm*fFwp~ds6skO00d2<)bXU<+IuNcx7sKVeR%(t-kx(YH4<6
zyj<Jg+Fn_`dw*kQVze;Pp~!vom|6ytes9_7PE01VA^c<Sm!!XH_t&9|Lt&l&(P68}
z5#ZP<JG%~){|CO*K4fRGTK>iS<Npo)7ygU?L;e;2MZnaTP74uqT`F|4=sefo)5#>z
z`LAZBsMOpYDd&T(V45mF_=&Cy72W>LgTF|JPb!v7q*)n97$$4=rP-;u{oVP!{f*s~
zsl_jTasTel%0jg=IhtuZc<{uT6Osc<T7m+yHgOah$i>+EC41j4o<GO#KkU*o{2VR;
ze$EZ21cv(~>(i~DUi~-t?+r?{to%O#5b$vT;{O0EY*ZFUuB*EM9ktLwegPZNAPMAA
z$^u6iIwg+x$0qV2cZfzO{9h&!PmNV3^6~ii3|$=s*#Vr4#F8m0!^OGz#cFk`UafD{
zC-1!W`pxRp?stFY^*#Dv$2bL~(0}&g38}Q;l=y6px?xG;1Z(jqB3oaHf6mJ0$5XQT
z4g3-KPjzv`XED2iG~Frw6ZWtt{+aog>|eqkR{xd!qX1I>13rf~fS;>@68Ip0hMgCc
zQ0UTNfLD+OwL)%CfafR8DfX{=BB|UM^{+IneulqPC=&l9N8}(dRsmMFSF6+2#X0uH
zmG{2-(Tnqo@BZWmdu!D>*2lJ&mXfEQ+8JRutKFwhos`}FVv=O>U;TXisNlb18xiwI
zb=5ayb_e{NHY?lPqzb0gUnTz{|El{1|EIDgfiL)l07m|6$hiStmvjYHh%P2T7klGT
zy~6|VG)!}gH9M6HdL!vvQSc|&?*ji5<Ue!O5_<wiv51jSG?gyTt=-t1tI!Ek%~j^_
zef!fdJ^a$gAMI}~)~4A6yuP}SzaaQU-lW+<dOccXuQR>gGNd(y<;fGQRW%=JZgc4P
zQ<UCW{PA(Fyn$cXF9y(9{Z{S|{wGBG(`lvt4g3cCb<l>RQ95ln0O^BJ1OV$(G9Unu
zM(4kf<E%YDnGN_x(%B;QZ>XCs<NqZ813yiFg=91~5@I`Cesp=aHodgGvb8^+-Ff}9
zPe1?U+uyu<{jEpqYpZj!3(J#fr^F%Zg5YmP`cIuw-5yetZsz1GHvh^%2TpfW{>l;n
zNco3;Gmnkk?ZSQnzmSUlB>6L{Uq<{Z%SF%RpF^7@{#Ez{o4^;Nrj%N*0Ro@bVTbWc
ziUi8?iT!Jg^RCg!3^iZY`KL>HTuV;yC#3#4T`G=`jV7aU@KY2iE-%ppw)4sxuPh{%
z-}=^PtN}f#ukAc|d}Dc@lOZzMs9I@P8mX&4>#>Z}XU?2s-zP`<bE+|;_0ok4z23Nl
z|17t7?KC^Pt+E0r_=We_e_j5i0a~ix(*0zFztRLJZNT6s|9v_HjevR%n7m{tr2cF2
zg5tdd2n_->U33>F(t!Zgn9(Wn{&+H5u9S09|C+6o*a416B%_#SIzKt<Z2RT?x#DP{
za%1nwPyNJ?fBN<`VR>%##XD<@)%;YU2ZnG7MExYz{?g^ES2@R)v;5@9X*uq{ub0u&
z+1A=RFbsYM-5+dq@;EH404n|~>^MK2{{mmd|7!V5cmE>&97@5)IdOHm1PF`8<o)1y
z`m^Bx3<AJ?iIU_IA9cl71b>)pWOQmg8;d1#Y;u+Qm%?9y|0x<o0wZh!s;un2wLams
z<l?KZZoKqUKl4-Xzdu_nba#$Vzy9*IDhH(G<a|e}Tvs@~n1j_hR+__2IrNwzhgR$7
zxDE7nwYOXR36X!QEyHv>af1mRKlaZE{tWR~?8gIy)bgL9{=)z0NH~D}8~SGOSy{m!
zAW4HDh=R_1F%4*^?wvS|;6h(;tegtsjnX-G`++|{F{SgrRw<2_#zzyWv0^lstW_sR
zcfatZxg<TKxxFte-G1`(Kk?EBJEP~09JyM%`@;HYd1j$z>gGgePO;_i3X)c-+==u`
z|E`>6L1Vv-!;np!Z#NX6{6qaW&e4PH`?Q(`zuwj^Dj))&t3URSozU37XXa0NfACBF
zOA<ha;s10mATQ{64Y5$j1@uvA4?Nv+<}B*S{YBtXcdtD*UP^{I{SLt&qwb#|*Da$5
zMYenj{#-IWK2-?kmgc6*rH%bU*M)&&L$AKLeESPO`~CgbU-Ps$+pcDal~<N#R~I=)
zO<FgV5mReeV#NU>ft+|UXdrii=n(vrzj1#-QXCFlCI?3TrT(u<KTqXf+&?=Yb^C94
zKM8(V{|W^_ZQwWfuS}1@X;E?&dZ|DR=X*xtPX$^|7vykh&eyb9?Y`9bSUfaBCY~u5
zQ;{hAuaN)8$EIf#|MSTV<?l>-ZDnz5b8FUn=5XhQ&BwIBzxvak-@ftEb?ez<r@V_d
zsEDp@t(Do*)EH{JimcNrr_QpD#|3(e$;9!_9mv0!zf?3pr@PnAoe*5Q{)-2As{Shc
z5eX9hOZ7{<Ke8XgR3IRY_y@&Tk)R>rEP(2)rrYpTi!3^GFA-rirwXyPULC~G*;<?{
zL8rAVkQpyVLoqV(bg7VtL{p<We)vBL|LOV9l<9AqS>N5hv3=vgwTV<^Yi)P;_WhTC
z`qPKEZp~lYsn>7by1u)!c4K$Fo)b3>XTe6iN~p>PXJ$$mC*bAe|Gqv9V5c>}dKUOc
z5RU-+pEyk~MD<&(00CcOe--@e3gq&H=>8$~74R&4$)P!Xr1*c`1nLql1`S7x4LTzN
zj(l^Bte7FOLL4l{fkNs$38juoBROv-kRBV2fIk&W6i4H1`yZW{68;ybt5YQoz{sH&
zvkMC=+t;pdZtm{i*t`8=eg4jk{kyMz@7;&@9xpCE+P`^&%@8|lYumf^B8SKnM)SEG
zwSAg9aZ;4<Ia0tOCzx=`xruY^`g=NhYypm45cd}x4sgC+SC>VaE){&(FG_>H5ckR%
zARGZ8-LE=!tb@YNrYNU!`as9!5N{B0A>tfC%_b8uz9yubGqgBHMpTHQ>=Cwz&2-F0
zPTKG5?e1|#N3-EjJfDgt3;B3BPQ&v=c{C9(p#NnKfXJZ`bBl}hmHOfayMC^3tlyih
zJgBei-hS=T>dLFN>cgF-8+$i)HdmJFizUtw0euEZl?YXed7|}f-%=-8$z{22Uss<!
zDEP(vvD3vp*wt+%{1N!7@(Fsi>M7`DxIBRfd>8B@O59dPOsQQ;{6}#)x;^85Ih^W`
zR?ES8a(ayHpeBG&L(GU6V2c?6?B#U!_N!N<7vb!M!B8$03?=f3k=ST9#xi@Bjqrt}
z;(wVfaaqB?K#Sk><i^^yTYKw!_hyTaW;d3XcXqa?A5B*8-rc;;lEC_n`MFA7mUI-O
zm@ApQTEG=iRVbi5rxtVPguVf1i1kbszk;mxvi-fM*AW!CR<;5L)AK8mD$td%N<+BP
zB2e%loQHt+I!+_zNI7SU!*6xKyxge*+)rW4mE}G@a_gnOo^Cc@T)lje6DZC~8#b}{
zvE!$D{nVf$sZp}Ld^Rdwmg&(_F`Xz@s#E2Oaxt4MOo6|)yt%ZswSxfMyti7qHNL*M
zxU;xlzB{*gi$x&Rf8*x%?wa13fytBpRB>mpQ$jge#+5H6eH<WUc1OUUlH*2Q?0Yl!
z4|=Hnu>8#*KsPqQ%AR+*{eX}G874-OZD2Bx21yASir*X)t4<M@<B~*IL|qujH~|N!
zd*O9k>tzEp`N%N}RMI0Mx%rVJr~3jN@yTw#WF(r)#-o(iMzS3HMHXD;$XobN`_J4$
zec|qYeRpSTV`q2w=JoZJv75^)ODnf_D{Kb4acyU3^X8*F%X<%Mu>?*1G7{3-FMdiY
zm6F#J){97pFgS-;@Wh}cV{Eo{dIQ{C$U6K~{NXdkk^@iV1T-B!yT4Ikx!VxCA{j^)
z2b2X&1R5L|B@&_TSb&KU^%Bq=zu(@*UR#!1&ya<Zg{eVzgw$lPk`f7U7}dF}^r1zg
zslr$?9L?n8F(wsE7RR%kZLH(Zq>58?H*DV8p4xeMYioUFd3$%ev~YKMrM9+ycd>ST
zeSK?vYwzyO)#<f+8{*Vu@J-b#@LKR$C9P04HX{A~ZkyTcQuw38+$D?PKyE99pMXR{
zBX)p^p@6TPfdw`Zb+sW#-WLgho<gmVIv7~)5itN_Aa`^C8MVH6;ruyiv_8p6V;~jK
z6hrx<KSuUPsACAOTy1a1P?|i^bS$15A5Rhk<dbySjrd~)`hLd*|759<NrS&We|_(I
zZGL6z`p!yyW?`<pbbWPkYH8`(+{9e1I=8mX&gj*}t()6pg+h_R!Exhb@<g7E6<Ah>
z2>3#}d*qH917=r9x__fIJG<ek%SHQln8cs(htqI41y`bGIRc2&NI-9wBN-4Sq6-QB
zCHj#_xw8`{x9OHlA?9qklc=>wxS$5M1VMTLbdv6C)?bMPxbv6c5)mFR`wC*o(TVY7
zIGoO>I1FqgFj5%LX9Yh;UKcaeL*^HjxAv|tudK2qio+gC6Q$ZZN8!!Pt=5YZW2F)&
z16AmcV`KF6#N^ZzCk0lk)#(a<KY<4+<a6Q+DOkaLb~kh9k3nZpy5BK>9uaER_d^6U
z`nsny0Hefgd>3)|u^o<E5V8A50$^GGQ22%YA$WY6<D@Av31Z5P!e$+?1VHd!*nRrU
z`3u12J`8vVF%~!nbcvnjvdN6&FB5TUpW~@;B$dykXaERBM%nW*&YCCq3z^I~^^L{-
zYirkTZf}$6V-z^|aj~L~#9W=6ERB|?7dNl(-dI}RzgeH<kifZl`lV{r1UWyA4x(%-
zA=w)3{haS+H4kvp!CrfSV;9LQaeo*D4jY!vFTL##3rGMhh7g-UJ3Mz!vU@R?qEDhp
zD*9Cx#lVaIf12FLOK7(!w@5kRE+>h@ZMn@2Rs<6w<#jnpL>Z@Oj7=zU_V-MTrz4Rh
z?QeN}AoxeK<DB=&zW>p5rc{Oh`-s5q&F$rSb$YTeIX5#|&W`13Gn2JSu~eI#U)|f=
z+uz>2&4C#6^BfAxZ<Q&XN+clQ1v~pXVSPV02x-6CZ3`q=$P)YlUyf4>lPU;e*%c?9
z(c(raa&!9?Y@C=&;1m5S=PEHZ1l6-Ysb;D6FOhQ`S_?tYbFqNpwk#>~2b^`xo?|i3
ztRo^w7-#uJ9Nr|$JmAk|^EtMp=Eg_a)jEwNgFg%Y`OUSB&2^UjuPx84RFhL2hBKbc
z=1YZ%+;l#_xUjgjz4zeuwbku)I-!x930>+505NNDSOOODMiO~+Vx+pPes(`4I9r?S
zkHpC1XW?5apTUtN8%~M^jv}KJ1L>#?6V8zTD%LY$wf|A6e~@B^%5p^3kBe2K!a|%+
zm@kQqSP#}xSyksYCP_+1@n}4uDoE5&#UiV11URf%lCAHh?8r!}IGP<D#Z2c)1-5?)
z{!$^!Dc_X$=y+IPy|%HjurSABSTZB=IDATHier<h+9F%RukUPbZ){7*C^DZ;ip;a6
ziTDG)6ZjAGO9;Y!aTtA$0O;Z)0kqtYQ9%DCOo08d%or=}<x(LPjldx8zG6jo-G0oU
zY<UQ)LoifM6$*i#LnWxRNT@G23+(6MENmKQr_e&j!R9I%Q{qGuM+T29Mn{Cm2+;_$
z`P?WcnX!X4MgTlYzYj_V|50j%e>B5Ji2CBv_UiR(yPI2U3sw50Nt8J#EG#GEu>o*u
zd}L~Ai*}$bdSZ%5sS15L*x3mCiTa7Pxi^m7R*ED2DTUDQD*=7Phxrevj&Ff4nO~g!
zwph*ad^8x48xj#4m^lxCmS9PvFq8zsz?WD!C>bsH48^QT$pJy<&>S%xL|;)Q>ENVZ
zhO7&tDD6W4GBN^lkv1lX_u;6U5#c`}KM7DF&l$LxOn$sDT5jN18{FqMmRGLtZm!K%
za<j7(Msl%$hcyGv%qe5!3em}x&CQ+d?XA^Gv03<d6_iNM!?2Kjv8Pvdb%;rmAc$KF
z;S$1Lr)+ry7Du5H)=>F~=a~!ps*}U)`9OYL7Iix!G1zi%CX0jnH0j<y7$mtHFi~r9
z^U=r-6qFh=p{GP$Mj|d+WQ-WFA)AW9Bz=-3f*q$LtnW&o%gR@=IGWFM4mw>AtOix4
zD(HW9baHl{HLvCMrIi&r0<i4xKc5VHoSYf2@;o>hp5SPlYul^!If+>93<=;6Mzxnu
zZlsdod}UHYoFP>oIbRCJ<1;vGB}B(FtXKHMoI8O2$d)98L>xwcFSiB3c#}d}xK*Le
z6%qo7`3s`sBzKr}v>aqm0I6p{0YwBvn;~i=?*;>sQc40LAq7L8$UUxNxiJ!*VkVU=
zjLC`LqobG~IsHTM7jnfZnm?y&3rovv0GyhguTd7xkEX(Irou9e1nk83f%4+&(){8a
zx5u-~0Ul0{aG+|Ql$BHNol=>e;ha29(N$hChha1DquG&R>HUE%?EVYU=S8fPU{fd`
zg2&)s=za-qxs{rkJH@sSx;O?-;Dbl<0tt%=A&l5Tcbv-BH%dr^5ORiw$~*v02|yAA
zUQlTgt#+JXz=F<rk>Emb=dr)Cr@vI0Qv6r=XQyj(bMv+8OszII!zx))`c|y&5z4nv
zl=Df*)yC?ag*;u)k*jfhNGdzZ`Dq-Qs!oWZeSU47v9`Pj{U%sb7?HRmr0%;lLd%=r
z2LtvWgDiCeAB959CP%Gy^?_d`fO98%iB}c>h1UUIBiqA+AOeyRV9q4b5{IhrN~S0!
zNBL_#J>qrABKT!xCJ+@=1@ni^CeTS0#!27V@X4xgLHI8_LI{Ltgk}dk0S)kHXRGCG
zlyY>`?+&Cgai2=<Ii3pw)M}N9>U3WCUFBfdrDdGv*3RxVe8cq{H?E5TU0bftf<G~$
z^grm~#D@qS=YDoS%Jvr#;1Oj&oScFt_c-j~wix2cB;VsMZRm@t1c>+tRPG}bXuL1M
zvYot7{*-%XQGHgm01*=v`6%%Kvp5Ec0%XL&U&7we=Pmd-@qDz%vd36i#D7Wx0Zuq5
z73qGPng)MuwpvPu0*QQP#2?BPQhpBWK+;DN93xY%R>oz8cx81R+}Ez}?cKa}>$Z$r
zxAt%DUB9+1__3bESe!5c|2b=lu0L`7lJ;`yuo!u;N|DB{cs|5kA0#dk1t)y=ax{f5
z|KO*Vp#MWua4JI}mZRhQsBKDdQ(U3A1c`Eq_{bd?cmm1EWJoxPro~zL1%DPVQ6Ox=
z;El<_&)}cp{4>r!s&Mkn)YL?2QutqEQ(!9SPZV+^K7Wc6!vS98jFcKN;_3=rz#G^1
zZtmZ@edq4od-w0(yLb2A-8<k{@OL&b5i{&EONyWiet`Pf$bji0{3rTR@Fg}!LXl>X
z3C<#unG>=}B+0pi7&Kk|{c_(-Sybui>F(+7?(Sho9YlN~z%dL^RJhGwOS26{IhC3!
z@ajQ=0V@8Z<KX9@%LJyE4oKnuxbS}x{P|JA&%H0k$IDYQGd0;1C-tIq+-G-$vuu){
z<6Jxr1-pI&*mv*UyZ?Z;a2XHoEBtrv+}__;ykD2&(j=lvMlo1&`bC5r$I50?740LO
z5I|Kysf6nMB36l0D3gSmNLrTuX?)b9A4qkTwY1hNa<5Ry=?H#Q5P&@phD+3^0yOpK
z*&Oh70zd`ovxF&x0u*VYu@v}ee1*t)^q+N*q8tTTB74Kpjp7IiX{Sr0oPt`NfdZAX
z_@`LVVIFj+s;iWQXo9+T|KSUd9=*tT^ytwGFT4Q!2XG(kG)nJmZ>+5@EpS}c1Q*J(
zyJ#3WR`M_Bx?s+g*Fq^}bx&lJIp@S(qK@2yA&<{g(yDSlL;R7C3zFb)qd0|Jq96Pq
zfCM&!09B$M5q9V&=yOJ07NsYLmLN#7Ar5*SL+&Ns&X19@q;qt=mP&Ly3H}0YG32t7
zlGsm>*3OXHpav7^pnHfzlV)!`0QifKUy||orN@sR3w(Hw)YB(-jfxRHvJ0d{bWTW0
zluNUk>cj5}{xCAi{f9UR0;LqhGNfoQPFz(k5m`}05cuGinyN^v?#~ewzh8>mM7+uZ
z5kM2l8j2$D)lkKFRgLFI2&;0ckuZnOQkEu2<5;aA2YteSq>_a`<eweSbiJ0b;WWH5
z0}lLAPMg_qAPlsap%H$)UW##0igRS{)?@a;7~?U_2m9T-xA*sM>~3$ab2_v@*K;Do
zh7g=}E4H{fiDQ{fEE0`#{1lv~UIOHavXl=nypvQ8CQ4M_nULZ?>k1<N3co}@Vk!JX
z)Q1Rv#F<IZqs%&nLQI8J3go*Y8W&P0=BNmX;1l;qG$rN10a`DU*e|%7=TB(zA_Wlq
z6D)#Hz=Cn~z$j%fv+2t8G%avzyL+2ccs{#pq;^Zee=N|2>x${yTO5zKv{;|#%wTk9
zYC_Dbl)*&dMA_wZ^^BZ1k5YkJFjqvjgu0X+D&UtKNbw)|qPN0+R72EXKl}^)62lCO
zGbiAQNPUQ)U(`o%3#+-Bij*X_)I$P2Bas@RWFvSA3kR9z#q6Q?87v<LAfGL8<R$kz
zlQ<kb0JDTb(}=#pU!7m)p1<Qh3x_t1%-%%fUwC*Q<oh?T>*5Y_9lFGNV@%($HmZDu
zN}`M$sf<hE<3yokbIBaELQo)=)QzO@1pBeVMP`<#`Ajb&O52xkQVgZwmk^H7=Xtjd
zew15a2UM+9R#1p@i5}%1Iw}e47q=l5!@iy#vo9|9{ZJIn5VA1}#W5Tj;$LR<bG$SO
zZ}5aD{Tx%{Hdq`1b8T-qW9{S?u%VgVJ9p6Y>mu#gxTVDfup{t-TxKrz2VTGt75`HX
z#IR6B(=jhBP&!y#kWr^>QUkvdHVB}zpV<@|>My=PSAUUXSzCbr68h<}9P%%gQSuuJ
zAqYx2Hl2__v05Ojdx^vH-d-GLFJFvqv!A|}e3mIPK~6Qr;xI)}tI{3#6`SCP7(KvG
zSEr{bgKp0yOr00ccX-RoTbo$8B|#y`m8lS$1x1N3B=IEg!Jz`19Dw3jeQ|MVd5Ja}
zeejgOt&7CW!hdu?@!wEB5zMFj7yJ$D2S~YTDK1{Pk2D0NSyYdlz|Bp3;jy4p1HAH7
z5bL<*JNzFQbOe*K!-*az3=wB}jf?HY48e5C!V!OpnIe!J{e&iq{&N$|rNzmJx%1Mw
ztIq69WA?nO!f578wHN3@PwX1$JVnzvZWOY-N)o^>O2)<pxxm`W>dMM8XA27LU_ba3
z`&HfaDgQ+}<pZT|(ZH{ypR%b$#jK8#$Vs?J)ffq<#=_v7HEzKxgXe@s@*0ooi%t}7
zu0=6W!tNO99;}iu2s?npb5kRjPv|qLJdlnb6IGdHUg)bRZLN~`_Fg`Bq0Qn;m8wRe
z6HYNz<!=QzA2T)0zQIMpVaWrwceZyKQY1iR6n@}?-w++~6AC}%*3F-&zgYhqA4+m6
zift6W1wWY%!7s7CTI8YnuL?^YYVcHk>g49T^xi;&9^)IgRpM+-$$2FrMDwNKgMvxh
zcOs9JD*OoF1e3u2i~eJeu>~ZEVhaU7v>1<=TU$<DXyYc8!zt`I)5OXVWB?q7OWMb+
zAtVjhmi*_28YDp1uI>I1esn=3XnuZP*pKwf<>b=h{QhrSXTp|cwT0ooP-YBjA>@!l
zcFNS80~~-*;)sBt$RMDAq-fU9KllBt{eIGQoh^P5lRRs9*09&RcZ>dRrHrBfG(@F8
zxD9f<GGzPzm`R|6RfGIt9Y1!QJptanaq=h<ILT#t7bzz^*ONH{It*%6<|QMemgkzY
z`*eT6kICZzpylXeDTqcSPd<Qun!NbIB*KU{%sjYy`q)=T`iJnb^ViL-2xbDe?!(U4
zH^jZOOC_-XcK_|$w|jd^0Q|4Fw>P)c{|5H2`AeD2{8j%GI410ez|BhJQ1ox9B7E{y
zEB{jeFGo2|j7O^a`uh9&`}+q5%!QQ!B2WM)umJCobhM6bUD6DSnxdeWRO4&>>&rgk
z%s(-~!8e?EhUZVSf5X2F0n&z?p5W{c!W1XtOmPx$js$$dx$hHquX1C_HzQ{mD){jh
zbw1%C2Z3%-?otTsQ2@PrN8RJIAv`O%_g4HnoSNC$|M~{-S1FiK|MC+0ujq$)`LCq|
zYVs%hnft~4gWt#|$J5pL<;an*jvQsEI6{a71`xvFAO^s^$Iy3Y&tIakeB1W^6Y6m)
zG|ntW{QTY8D%xI{HNw!9qWdL&A>Y#xIP}f`x&Lq){5G_w#zz^AfB;F1sDOU@;oR8^
zoRod-#^l2Dmus)KUcXV@f4)07_`o`NFM8o)0SMr;`pf<iKlopAs=^D^FZ-i@=&zC=
zGp{cACw;*D3G2{5`yv(Mz|S}_BKYd)(XS%|EI|0jPMkb-hWi6JFPiSt)XXdiD@QlL
zKijXj@xU<9*5<|rM;a_Y{lf%<8iS{NY*f*B9b-R)s}CNdghvziAN-EH(+6W8_})W~
zM2>gP(49pAoLO`G?&R}rD?jSpk01Y{`1{EEZ~*?M{=&bvll}+h4}SQ+;m7+s_@s&<
zS93p;f4x8EZ}QXJKe<oftNzPheEEg=5kTR0@`Iam$gS`2$kLQO8(3(a!-_W{qOEQA
zuEie<K>e#L&zBha1^lF`IrvF?JOYYYSKh+{9szoiSg1v^C4ewt4MN${EDuGnsX!;;
z;nEJZ-=|L$J%3a4v4UUtDBs4%{{9|I_<H{~)+hc_zvO4O_{_t_)jrlQ`z7}yDVO~l
z{Ak|<C)V$|kmA<>j__RR0QY5cJu+qWopH>4c8;Fq3YRvlwTNo~Vr_3j!2AZAWNRzW
zmlx+|euudb0WCeUf4kXS0(AGc`_rm_#^m+kW3N7j(#iZ5m*sTg*y@w}V>En77?yT;
z1Ud!pr@uS&=wI#Y`=F2Y@33C8w%j9t{yO-b_lNw${lYi*Mg5tjLH$%SN$1VyfnV%*
z`RAWG>F7(gHn}>AyK{c^$ZnocfQd-4mS4PFfn3B-%E5wZS0QEW59lOp+~&dlHtj9X
zcMJYevOh|$sfXP7b<d8l{7*bHnn@jN_VOS%T>Sn1tEU#K)3YnPz(@To+8!}}t>2zc
zbdT@j{H1->ukjoGKa*eQ{WyPzzf%9OA8BOBFBN-N`Cs9oXuF*K$GJ7h+lpu!PfpXY
zoTHLhAe5R`zd-%$Sfl@S*YeyS&sNq~rCYL8ZgZtF;l4vQym5d1U;Uf^PzjT6aQ6*$
z8QCr#L_9<1yZd&vXzyU_|1b!#IQMjZdE@P0(Y>Z`$3xqX^B;x}`NjBq&VShY<-T6(
zQ?YM`{JTf~;!o_iV^QiS^|6s}FZ4Rc0uat0Swlk2PwlZUK4To8vI1&-v9gMwiL*>@
zKe)w7-lS@5O+tCJgYKEHjQ$3z1^+nsN&eCSemDhba^i!)x77Lmy8{|Lws?5s<?l}i
zw6qw4)9*|B&GY5{X#d?lO+UF`^!h%zAL?(ppWL6+FYwbMK>hBx$$rzUs{8_f4D*Nk
ztA<bK!uIHe^LQX#nJESkR2CkJO0WR;-=P%?i|1w+9N#=Q%Oj37Ejiz3)D9>$3E#+l
zZ<IaJKWz`HU*IP*TI5Xc4RmfRlTIeAu9^$H@A$^Y050pX%JXUdiT>c-`$l{%@^#b~
z>rd*7POwg6ACXV>n~DkFs7ve%`J4LCyJ_k-^}R`#*j>9cxzC=y2Ro(2z-=)AGjoe+
z?xe>Cl2N#aZ|cef_YbEoUZp2^hvS|nSSioQDS5Jo9D@%`a^A6NE7rcL&M|jFxn|vT
z4DdkrrEmkihd#6~cq8BYj`=F@@wo70{)a;!q2I(;>lc5^d<=e#{76gme(<Ay8y}#z
zdE+IysqaQ8*m*#oQUkTc1Nb47X;zT@nBMFKS;Os{95TlB;aqBr0r2A}$oUVW4s7~j
zKn9FwdP|fpay;2+?rJDM{evJH@|e8f<Muc{@IM@M@S#49ALdKkYwAzp6Yfjqlgw|K
ztPJ=NQW8IY?Na{~v9B`Ug0J(_tz<ut>p^oHGzI+#69_EamIVc{iH+;v-6YI<j01&k
z2DAA;Hg@gIFpcqZ-XcUI#Y6@bGvg3rlN@tNt>WQU?pWyQ0FlT<6?lTg3bX^gJt`0K
zX7+q-TK&TB(jMcU=*jt$^(OBz@WaODy-9pq-4pXC$u#s=(K+=w^<j)QO+yTyMvjvN
zv~ielvyTCRk=2vdM_7(nQ6t!2yyCh>Lp0FG8TGhz<s3y27i%ya-~l_{@9>CI9e_@C
zy2WsBa&~tuaANU*JwhlkXa|r$>AjR*=v92JX8SNd4S%%1ySukT*7rv0GwyDZa+>&r
zkGr-<vwl+_(KqX#lKd+50aB@+xiPi`+bn_IjgevU0;GdA+f>cLk?us&6yh97w|fkl
zbPLIu*kixIR0#V-|FQG<J4z=@=t8=AyBk#~PY{8l1+Ze;7y_aFQ_MlK)7p_nzCg-j
z1-#JtLE-(*^kshLBYVbg<VX9&rg(oNKQmvOzL1rPznP!p#}EnnyyigUkM056#Iv6z
z`jL@gT%rg;$*=`RBb<{%b4^l}iB}ywWe=U1i;HyJ?~F3qLkUPs*VErnpzh(N`|Q%O
z-VxrL0Ad7X(_sA|()`s1c(0K2!9b7M_YB{ZFP>jPza_m@^qBVqzGaVPZ&-TX%oq4N
zKk%#UCqi>Ar!Jgn(CY5Q^MxP$Cr>g?Prv8%X-2)dF(s!aUe^%#mY?kZ`S~;7J6l<)
zAw8VOGYhK}snkKOEkS9D1k}F<P&S|!lobFlS_WyHTzd#=Uh@@vVX2;k*Pj9J#q(Re
zr|nnRPu`cRC%&K1m&_;JU-(<-cVH0kcOfY!Niqp+bL(8@=QBry@!Q~HyYKY(LT8Wj
zDru2B8uY==o@LOA)(j8U-DD%)zQ!!M`-`>p)fe1fV;NAp(j=yq<_N%F+nW7PEI=pt
zaPU!FrQoLLl0{-X2Jff*lEgbDhXC_qBtEPCH=XNKcO>nR`kM7u(<{_Z&$AL=xu3{S
z?^N(U-$EOgNoFBw`|N3}?`WUSHGRFa(wW=;&J`LGM}~)nc;tbk`0_PdTn~s1mO}R6
zR?UJpQV5dMHT*b$3j+W?0^m<<092w^yMY<7zPJR<O9X|-xXSU(_ktecYv|MP=#qY{
zpQcNk-^|CNzskNRmVGnaKT(roTsaPhli;^$cP42xeSIJJQvdKUNQbhU!9xtR{?hUo
z8a{L0NhxYTc30T0qWuVVD}k3(0+xc-fuBoZ5>r^(eodTUKPDU+0Fy9X@0*@`IxpaS
zLcUP*icQ<s@o}eKxl^`py;*^;eBZne?yH`UnV*TT`HEXF_&m>yz-pq8?y+ZW-=N>q
zKHNt;_uCVL{e67{1IO3{{qD>Ks^6>Exy>7UpD`1**S=&Wpa5R>DnaV6NPu7N4N5`)
zh(H>Y1$YySz=Q@;R)i40$V)z!PxA$U2fgr7eV6jIa;L&xgRk{<fj>!kC4LJ}=S|lH
z-v|%zRo>Xv)|)YXjo;lHRyJ<vV-GP%>6GWlDZKy!U~rTRyXka2`7h|(rV{uq0%8E@
zKecefPcuXbWHzOopt*snEG`tB0*EN`UOBtCg&y<)U!@LQl9UJQ2YbrBbUTyv$o##$
zCwXtmJ}NTJo;x^=^LZN1-5V!BK+hx<*d3ya?Q8w$UhV__H%4>@=&XOKgs9#hlLd2Y
zjfEhka2Y@<A^J?&eNY0gsuuzWN&>L~y}%VCkO-uLvV_16T4hm?Rd^Jyq-4%2^!y3o
z{ScoqU*4;FzxrJkqOABn<Tr11LTC9tqlsoP;qc4xd(ra@MA<&NFZd?Cv3=0f;Ad)>
zC-cWCV?l2kpE8IRUACzez!EMH56}gmz%|~o1G7mCq#>9vs23cTfNU5~2s~}!-%I$&
zM+pVIcAp+sXy0Ntj&IImvj=%s+_&3H{{icl`f5Jh%_n!K?P{6Kbm%WR_0HsP!d_Fp
zn|Nv;*LSny#0jUEC}zhWa!TI>b&!QH2Eb<R9E)DSqIF=81F!%{Ff7>GVVEeDnMQyF
z`0ag@!N3z4fR8`_cg0Gg_{Mrsy`?_+J_(=N5ARpS!@rC513wLzMGA5{%AL#8)APC$
za<x3}8`DqFyKtT<u*&#*%#ZKaUJv-$5<>r6;5RWrzU<ar2|yQ^o11rESqW4(1Ospk
zk2q)^4hyjKfs%k0kfjwP0ipSpxQGlCgX~!U_2ac{aqtcGy`qorH`@>I+kfgYKi#J^
z-U6OZs{;Y1=S}%|zUUnw)%fv!C)vV$$@t6l^?lTToD|<mHV%LUrdlV2+X^E95=$+B
zARr4S2oQY{0lyG3*d!9#6VKW<VgsUpR6M{8Sip=J+N=LtlF27|QC^ffyL`Lor9MsH
zRlbkz!+kVdpSd$0mmi1kYCZw*dcHk9)7H>_d4ACMjGw5Vum|<AyXVf=9mn@6ZpW((
zv=a~mn8`c^-3$a^JFIg7dZI}L*3%SyWmnv;a7jQP2oI=AF_{@_2$4ZSj;ZHG@c9C<
z+un@j2Ytbp`s}+_yN9g?CSmNjvZYqlIsL3|kPFAh^x?gq-OIgnDPx-#qJ5_riTmaH
z@qOWgUi`P1c=^qde#VgD02mMfJhl^JGsuzKd742A0103Lm;f`(l0Xaq?U)YAgSvq{
z5Cf(xiUgE`I2$8$@AQBNIzlMus=k!pv|q|!XP<$6Htvow%ER?k<J}n7O)WpBU+|mu
zF@86*lkk@IGrG`=d%+L<K`w+v05||wxY-B50}O)hT@eHfgJl6o02gQyFb*^c@EgVg
zN&w{v*g&M9R&|D0g0|2MLUq?FDQPYGPJu^?F@6La?UV7c$r8FVy=%MJ*zITC8}!h<
zb1G%~@qL&t_;LMH1pQMZ7(Wl|!hAYEzJH8eUq(S0`4K-9XeUSk=mGBCkWt41FaUi3
z{02e@c*Qs%3BZEX1N^EIAvO>{NH~ZYh|JZK!|txBZ3!SBR|2eRa<rgV%oq6Ndt48?
zFvro6lytUdXjR<319}V}+AG~B=#}R;<0tGn*Lo=9@4`OtgTCMovi+7e$Cw@6vH*Nw
z`msJR%M=+VKqd$WbR^Jn!BumiE)Ze?tcnHk12q{N&<+Zpuh@gH{xwvV%g6Nna?zuD
zR+x`H7DwL}oC9K{)akX+p!boZ*2njOA45JdzY!mvf5`PEewn|jeW{O$4s2iTbAkvD
zI^N($Y@hQObzqYDA}jy{kOXsQEBF-v9FRDm4-^ToHI#B_jnHIzf(=*_{L2ht((-xP
zf}-$Ez<>|#d`9p@P>d(w`59z8Y^)mPoQH8PVw@H(k&15A?PmBHbp?E7JTPBO|3`wp
zo{#P=c4fZPjryd1D!#yX<Y55#;OC)X1TdQU$$JmrK;Tcgjg>qI28aOW+4QH;L>zQu
zOd4!)@Q@M247SY*S>^;REdhM2dJSMCVr`$dd_G_6cZr>uN5<c@d9Y*iU<ZAvK5m~K
z%mn;6z6oD*eRXftx7rauv=4rmAL}1xz6JE(eyi@C_ZaL4|MeT58KMA286brO%K*gB
zPQF`0EL!66fozPmRLIRi3qtX;T1Nbt3z0zBpzsHL_pbU@9&I6`vivB1mHUtv400Kx
z%*<jldsL6xm*0!tZcT#TA=^KMK3R|I4|=*CT7Lz;FrV7*`}$Ys+6jWBL3Fq?(8G2>
z_5(tKgu!fz-~(m?6$dLB)CO$EbP_NW;06*wJVAXHBZ5z?z9v%?t{-X0%kJ4vb~io#
z?z05$TXaJ^n7x!2^fy>d`Jm4xra=$uNBb8!9N>bsUpXJ>%lEN;iQllN_8s&Y80hQk
G`}%(cy4YR-

literal 0
HcmV?d00001

diff --git a/benchmarks/new_opencl/convolution/kernel.cl b/benchmarks/new_opencl/convolution/kernel.cl
new file mode 100755
index 00000000..ff56dc6f
--- /dev/null
+++ b/benchmarks/new_opencl/convolution/kernel.cl
@@ -0,0 +1,54 @@
+__kernel
+void convolution(
+   __read_only  image2d_t  sourceImage,
+   __write_only image2d_t  outputImage, 
+   int rows,
+   int cols,
+   __constant float* filter, 
+   int filterWidth,
+   sampler_t sampler) 
+{
+   // Store each work-item’s unique row and column
+   int column = get_global_id(0);
+   int row    = get_global_id(1);
+
+   // Half the width of the filter is needed for indexing 
+   // memory later
+   int halfWidth = (int)(filterWidth/2);
+
+   // All accesses to images return data as four-element vector 
+   // (i.e., float4), although only the 'x' component will contain 
+   // meaningful data in this code
+   float4 sum = {0.0f, 0.0f, 0.0f, 0.0f};
+    
+   // Iterator for the filter
+   int filterIdx = 0;
+    
+   // Each work-item iterates around its local area based on the 
+   // size of the filter
+   int2 coords;  // Coordinates for accessing the image
+   // Iterate the filter rows
+   for(int i = -halfWidth; i <= halfWidth; i++) {
+      coords.y = row + i;
+
+      // Iterate over the filter columns
+      for(int j = -halfWidth; j <= halfWidth; j++) {
+         coords.x = column + j;
+
+         float4 pixel;
+         // Read a pixel from the image.  A single channel image 
+         // stores the pixel in the 'x' coordinate of the returned
+         // vector.
+         pixel = read_imagef(sourceImage, sampler, coords);
+         sum.x += pixel.x * filter[filterIdx++];
+      }
+   }
+    
+   // Copy the data to the output image if the
+   // work-item is in bounds
+   if(row < rows && column < cols) {
+      coords.x = column;
+      coords.y = row;
+      write_imagef(outputImage, coords, sum);
+   } 
+}
\ No newline at end of file
diff --git a/benchmarks/new_opencl/convolution/main.cpp b/benchmarks/new_opencl/convolution/main.cpp
new file mode 100755
index 00000000..f5bf1584
--- /dev/null
+++ b/benchmarks/new_opencl/convolution/main.cpp
@@ -0,0 +1,261 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <CL/cl.h>
+
+#include "utils.h"
+
+// This function takes a positive integer and rounds it up to
+// the nearest multiple of another provided integer
+unsigned int roundUp(unsigned int value, unsigned int multiple) {
+
+  // Determine how far past the nearest multiple the value is
+  unsigned int remainder = value % multiple;
+
+  // Add the difference to make the value a multiple
+  if(remainder != 0) {
+          value += (multiple-remainder);
+  }
+
+  return value;
+}
+
+// This function reads in a text file and stores it as a char pointer
+char* readSource(char* kernelPath) {
+
+   cl_int status;
+   FILE *fp;
+   char *source;
+   long int size;
+
+   printf("Program file is: %s\n", kernelPath);
+
+   fp = fopen(kernelPath, "rb");
+   if(!fp) {
+      printf("Could not open kernel file\n");
+      exit(-1);
+   }
+   status = fseek(fp, 0, SEEK_END);
+   if(status != 0) {
+      printf("Error seeking to end of file\n");
+      exit(-1);
+   }
+   size = ftell(fp);
+   if(size < 0) {
+      printf("Error getting file position\n");
+      exit(-1);
+   }
+
+   rewind(fp);
+
+   source = (char *)malloc(size + 1);
+
+   int i;
+   for (i = 0; i < size+1; i++) {
+      source[i]='\0';
+   }
+
+   if(source == NULL) {
+      printf("Error allocating space for the kernel source\n");
+      exit(-1);
+   }
+
+   fread(source, 1, size, fp);
+   source[size] = '\0';
+
+   return source;
+}
+
+void chk(cl_int status, const char* cmd) {
+
+   if(status != CL_SUCCESS) {
+      printf("%s failed (%d)\n", cmd, status);
+      exit(-1);
+   }
+}
+
+int main() {
+
+   int i, j, k, l;
+
+   // Rows and columns in the input image
+   int imageHeight;
+   int imageWidth;
+
+   const char* inputFile = "input.bmp";
+   const char* outputFile = "output.bmp";
+
+   // Homegrown function to read a BMP from file
+   float* inputImage = readImage(inputFile, &imageWidth,
+      &imageHeight);
+
+   // Size of the input and output images on the host
+   int dataSize = imageHeight*imageWidth*sizeof(float);
+
+   // Output image on the host
+   float* outputImage = NULL;
+   outputImage = (float*)malloc(dataSize);
+   float* refImage = NULL;
+   refImage = (float*)malloc(dataSize);
+
+   // 45 degree motion blur
+   float filter[49] =
+      {0,      0,      0,      0,      0,      0,      0,
+       0,      0,      0,      0,      0,      0,      0,
+       0,      0,     -1,      0,      1,      0,      0,
+       0,      0,     -2,      0,      2,      0,      0,
+       0,      0,     -1,      0,      1,      0,      0,
+       0,      0,      0,      0,      0,      0,      0,
+       0,      0,      0,      0,      0,      0,      0};
+
+   // The convolution filter is 7x7
+   int filterWidth = 7;  
+   int filterSize  = filterWidth*filterWidth;  // Assume a square kernel
+
+   // Set up the OpenCL environment
+   cl_int status;
+
+   // Discovery platform
+   cl_platform_id platform;
+   status = clGetPlatformIDs(1, &platform, NULL);
+   chk(status, "clGetPlatformIDs");
+
+   // Discover device
+   cl_device_id device;
+   clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL);
+   chk(status, "clGetDeviceIDs");
+
+   // Create context
+   cl_context_properties props[3] = {CL_CONTEXT_PLATFORM,
+       (cl_context_properties)(platform), 0};
+   cl_context context;
+   context = clCreateContext(props, 1, &device, NULL, NULL, &status);
+   chk(status, "clCreateContext");
+
+   // Create command queue
+   cl_command_queue queue;
+   queue = clCreateCommandQueue(context, device, 0, &status);
+   chk(status, "clCreateCommandQueue");
+
+   // The image format describes how the data will be stored in memory
+   cl_image_format format;
+   format.image_channel_order     = CL_R;     // single channel
+   format.image_channel_data_type = CL_FLOAT; // float data type
+
+   // Create space for the source image on the device
+   cl_mem d_inputImage = clCreateImage2D(context, 0, &format, imageWidth, 
+      imageHeight, 0, NULL, &status);
+   chk(status, "clCreateImage2D");
+
+   // Create space for the output image on the device
+   cl_mem d_outputImage = clCreateImage2D(context, 0, &format, imageWidth, 
+      imageHeight, 0, NULL, &status);
+   chk(status, "clCreateImage2D");
+
+   // Create space for the 7x7 filter on the device
+   cl_mem d_filter = clCreateBuffer(context, 0, filterSize*sizeof(float), 
+      NULL, &status);
+   chk(status, "clCreateBuffer");
+
+   // Copy the source image to the device
+   size_t origin[3] = {0, 0, 0};  // Offset within the image to copy from
+   size_t region[3] = {imageWidth, imageHeight, 1}; // Elements to per dimension
+   status = clEnqueueWriteImage(queue, d_inputImage, CL_FALSE, origin, region, 
+      0, 0, inputImage, 0, NULL, NULL);
+   chk(status, "clEnqueueWriteImage");
+    
+   // Copy the 7x7 filter to the device
+   status = clEnqueueWriteBuffer(queue, d_filter, CL_FALSE, 0, 
+      filterSize*sizeof(float), filter, 0, NULL, NULL);
+   chk(status, "clEnqueueWriteBuffer");
+
+   // Create the image sampler
+   cl_sampler sampler = clCreateSampler(context, CL_FALSE, 
+      CL_ADDRESS_CLAMP_TO_EDGE, CL_FILTER_NEAREST, &status);
+   chk(status, "clCreateSampler");
+
+   const char* source = readSource("kernel.cl");
+
+   // Create a program object with source and build it
+   cl_program program;
+   program = clCreateProgramWithSource(context, 1, &source, NULL, NULL);
+   chk(status, "clCreateProgramWithSource");
+   status = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
+   chk(status, "clBuildProgram");
+ 
+   // Create the kernel object
+   cl_kernel kernel;
+   kernel = clCreateKernel(program, "convolution", &status);
+   chk(status, "clCreateKernel");
+
+   // Set the kernel arguments
+   status  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_inputImage);
+   status |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_outputImage);
+   status |= clSetKernelArg(kernel, 2, sizeof(int), &imageHeight);
+   status |= clSetKernelArg(kernel, 3, sizeof(int), &imageWidth);
+   status |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &d_filter);
+   status |= clSetKernelArg(kernel, 5, sizeof(int), &filterWidth);
+   status |= clSetKernelArg(kernel, 6, sizeof(cl_sampler), &sampler);
+   chk(status, "clSetKernelArg");
+
+   // Set the work item dimensions
+   size_t globalSize[2] = {imageWidth, imageHeight};
+   status = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, globalSize, NULL, 0,
+      NULL, NULL);
+   chk(status, "clEnqueueNDRange");
+
+   // Read the image back to the host
+   status = clEnqueueReadImage(queue, d_outputImage, CL_TRUE, origin, 
+      region, 0, 0, outputImage, 0, NULL, NULL); 
+   chk(status, "clEnqueueReadImage");
+
+   // Write the output image to file
+   storeImage(outputImage, outputFile, imageHeight, imageWidth, inputFile);
+
+   // Compute the reference image
+   for(i = 0; i < imageHeight; i++) {
+      for(j = 0; j < imageWidth; j++) {
+         refImage[i*imageWidth+j] = 0;
+      }
+   }
+
+   // Iterate over the rows of the source image
+   int halfFilterWidth = filterWidth/2;
+   float sum;
+   for(i = 0; i < imageHeight; i++) {
+      // Iterate over the columns of the source image
+      for(j = 0; j < imageWidth; j++) {
+         sum = 0; // Reset sum for new source pixel
+         // Apply the filter to the neighborhood
+         for(k = - halfFilterWidth; k <= halfFilterWidth; k++) {
+            for(l = - halfFilterWidth; l <= halfFilterWidth; l++) {
+               if(i+k >= 0 && i+k < imageHeight && 
+                  j+l >= 0 && j+l < imageWidth) {
+                  sum += inputImage[(i+k)*imageWidth + j+l] * 
+                         filter[(k+halfFilterWidth)*filterWidth + 
+                            l+halfFilterWidth];
+               }
+            }
+         }
+         refImage[i*imageWidth+j] = sum;
+      }
+   }
+
+   int failed = 0;
+   for(i = 0; i < imageHeight; i++) {
+      for(j = 0; j < imageWidth; j++) {
+         if(abs(outputImage[i*imageWidth+j]-refImage[i*imageWidth+j]) > 0.01) {
+            printf("Results are INCORRECT\n");
+            printf("Pixel mismatch at <%d,%d> (%f vs. %f)\n", i, j,
+               outputImage[i*imageWidth+j], refImage[i*imageWidth+j]);
+            failed = 1;
+         }
+         if(failed) break;
+      }
+      if(failed) break;
+   }
+   if(!failed) {
+      printf("Results are correct\n");
+   }
+             
+   return 0;
+}
\ No newline at end of file
diff --git a/benchmarks/new_opencl/convolution/utils.cpp b/benchmarks/new_opencl/convolution/utils.cpp
new file mode 100644
index 00000000..74ca6dad
--- /dev/null
+++ b/benchmarks/new_opencl/convolution/utils.cpp
@@ -0,0 +1,180 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "utils.h"
+
+void storeImage(float *imageOut, 
+                const char *filename, 
+                int rows, 
+                int cols, 
+                const char* refFilename) {
+
+   FILE *ifp, *ofp;
+   unsigned char tmp;
+   int offset;
+   unsigned char *buffer;
+   int i, j;
+
+   int bytes;
+
+   int height, width;
+
+   ifp = fopen(refFilename, "rb");
+   if(ifp == NULL) {
+      perror(filename);
+      exit(-1);
+   }
+
+   fseek(ifp, 10, SEEK_SET);
+   fread(&offset, 4, 1, ifp);
+
+   fseek(ifp, 18, SEEK_SET);
+   fread(&width, 4, 1, ifp);
+   fread(&height, 4, 1, ifp);
+
+   fseek(ifp, 0, SEEK_SET);
+
+   buffer = (unsigned char *)malloc(offset);
+   if(buffer == NULL) {
+      perror("malloc");
+      exit(-1);
+   }
+
+   fread(buffer, 1, offset, ifp);
+
+   printf("Writing output image to %s\n", filename);
+   ofp = fopen(filename, "wb");
+   if(ofp == NULL) {
+      perror("opening output file");
+      exit(-1);
+   }
+   bytes = fwrite(buffer, 1, offset, ofp);
+   if(bytes != offset) {
+      printf("error writing header!\n");
+      exit(-1);
+   }
+
+   // NOTE bmp formats store data in reverse raster order (see comment in
+   // readImage function), so we need to flip it upside down here.  
+   int mod = width % 4;
+   if(mod != 0) {
+      mod = 4 - mod;
+   }
+   //   printf("mod = %d\n", mod);
+   for(i = height-1; i >= 0; i--) {
+      for(j = 0; j < width; j++) {
+         tmp = (unsigned char)imageOut[i*cols+j];
+         fwrite(&tmp, sizeof(char), 1, ofp);
+      }
+      // In bmp format, rows must be a multiple of 4-bytes.  
+      // So if we're not at a multiple of 4, add junk padding.
+      for(j = 0; j < mod; j++) {
+         fwrite(&tmp, sizeof(char), 1, ofp);
+      }
+   } 
+
+   fclose(ofp);
+   fclose(ifp);
+
+   free(buffer);
+}
+
+/*
+ * Read bmp image and convert to byte array. Also output the width and height
+ */
+float* readImage(const char *filename, int* widthOut, int* heightOut) {
+
+   uchar* imageData;
+
+   int height, width;
+   uchar tmp;
+   int offset;
+   int i, j;
+
+   printf("Reading input image from %s\n", filename);
+   FILE *fp = fopen(filename, "rb");
+   if(fp == NULL) {
+       perror(filename);
+       exit(-1);
+   }
+
+   fseek(fp, 10, SEEK_SET);
+   fread(&offset, 4, 1, fp);
+
+   fseek(fp, 18, SEEK_SET);
+   fread(&width, 4, 1, fp);
+   fread(&height, 4, 1, fp);
+
+   printf("width = %d\n", width);
+   printf("height = %d\n", height);
+
+   *widthOut = width;
+   *heightOut = height;    
+
+   imageData = (uchar*)malloc(width*height);
+   if(imageData == NULL) {
+       perror("malloc");
+       exit(-1);
+   }
+
+   fseek(fp, offset, SEEK_SET);
+   fflush(NULL);
+
+   int mod = width % 4;
+   if(mod != 0) {
+       mod = 4 - mod;
+   }
+
+   // NOTE bitmaps are stored in upside-down raster order.  So we begin
+   // reading from the bottom left pixel, then going from left-to-right, 
+   // read from the bottom to the top of the image.  For image analysis, 
+   // we want the image to be right-side up, so we'll modify it here.
+
+   // First we read the image in upside-down
+
+   // Read in the actual image
+   for(i = 0; i < height; i++) {
+
+      // add actual data to the image
+      for(j = 0; j < width; j++) {
+         fread(&tmp, sizeof(char), 1, fp);
+         imageData[i*width + j] = tmp;
+      }
+      // For the bmp format, each row has to be a multiple of 4, 
+      // so I need to read in the junk data and throw it away
+      for(j = 0; j < mod; j++) {
+         fread(&tmp, sizeof(char), 1, fp);
+      }
+   }
+
+   // Then we flip it over
+   int flipRow;
+   for(i = 0; i < height/2; i++) {
+      flipRow = height - (i+1);
+      for(j = 0; j < width; j++) {
+         tmp = imageData[i*width+j];
+         imageData[i*width+j] = imageData[flipRow*width+j];
+         imageData[flipRow*width+j] = tmp;
+      }
+   }
+
+   fclose(fp);
+
+   // Input image on the host
+   float* floatImage = NULL;
+   floatImage = (float*)malloc(sizeof(float)*width*height);
+   if(floatImage == NULL) {
+      perror("malloc");
+      exit(-1);
+   }
+
+   // Convert the BMP image to float (not required)
+   for(i = 0; i < height; i++) {
+      for(j = 0; j < width; j++) {
+         floatImage[i*width+j] = (float)imageData[i*width+j];
+      }
+   }
+
+   free(imageData);
+   return floatImage;
+}
\ No newline at end of file
diff --git a/benchmarks/new_opencl/convolution/utils.h b/benchmarks/new_opencl/convolution/utils.h
new file mode 100644
index 00000000..2686de50
--- /dev/null
+++ b/benchmarks/new_opencl/convolution/utils.h
@@ -0,0 +1,11 @@
+#ifndef __UTILS__
+#define __UTILS__
+
+typedef unsigned char uchar;
+
+float* readImage(const char *filename, int* widthOut, int* heightOut);
+
+void storeImage(float *imageOut, const char *filename, int rows, int cols, 
+                const char* refFilename);
+
+#endif
\ No newline at end of file
diff --git a/benchmarks/new_opencl/transpose/.gitignore b/benchmarks/new_opencl/transpose/.gitignore
new file mode 100644
index 00000000..dd07f846
--- /dev/null
+++ b/benchmarks/new_opencl/transpose/.gitignore
@@ -0,0 +1 @@
+transpose
\ No newline at end of file
diff --git a/benchmarks/new_opencl/transpose/Makefile b/benchmarks/new_opencl/transpose/Makefile
new file mode 100644
index 00000000..4092ed9f
--- /dev/null
+++ b/benchmarks/new_opencl/transpose/Makefile
@@ -0,0 +1,44 @@
+RISCV_TOOL_PATH ?= $(wildcard ~/dev/riscv-gnu-toolchain/drops)
+POCLCC_PATH ?= $(wildcard ~/dev/pocl/drops_vortex_cc)
+POCLRT_PATH ?= $(wildcard ..)
+DRIVER_PATH ?= $(wildcard ../../../driver/sw)
+
+CXXFLAGS += -std=c++11 -O0 -g -fpermissive -Wall -Wextra -pedantic -Wfatal-errors
+
+CXXFLAGS += -I$(POCLRT_PATH)/include
+
+LDFLAGS += -L$(POCLRT_PATH)/lib -L$(DRIVER_PATH)/simx -lOpenCL -lvortex
+
+PROJECT = transpose
+
+SRCS = main.cc transpose_gold.cpp
+
+all: $(PROJECT)
+
+kernel.pocl: kernel.cl
+	POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCLCC_PATH)/lib:$(DRIVER_PATH)/simx $(POCLCC_PATH)/bin/poclcc -o kernel.pocl kernel.cl
+ 
+$(PROJECT): $(SRCS)
+	$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
+
+run-fpga: $(PROJECT) kernel.pocl   
+	LD_LIBRARY_PATH=$(POCLRT_PATH)/lib:$(DRIVER_PATH)/opae:$(LD_LIBRARY_PATH) ./$(PROJECT)
+
+run-ase: $(PROJECT) kernel.pocl   
+	LD_LIBRARY_PATH=$(POCLRT_PATH)/lib:$(DRIVER_PATH)/opae/ase:$(LD_LIBRARY_PATH) ./$(PROJECT)
+
+run-simx: $(PROJECT) kernel.pocl   
+	LD_LIBRARY_PATH=$(POCLRT_PATH)/lib:$(DRIVER_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT)
+
+run-rtlsim: $(PROJECT) kernel.pocl   
+	LD_LIBRARY_PATH=$(POCLRT_PATH)/lib:$(DRIVER_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT)
+
+.depend: $(SRCS)
+	$(CXX) $(CXXFLAGS) -MM $^ > .depend;
+
+clean:
+	rm -rf $(PROJECT) *.o *.dump .depend 
+
+ifneq ($(MAKECMDGOALS),clean)
+    -include .depend
+endif
\ No newline at end of file
diff --git a/benchmarks/new_opencl/transpose/main.cc b/benchmarks/new_opencl/transpose/main.cc
new file mode 100644
index 00000000..f72cb851
--- /dev/null
+++ b/benchmarks/new_opencl/transpose/main.cc
@@ -0,0 +1,387 @@
+/*
+ * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+ 
+/* Matrix transpose with Cuda 
+ * Host code.
+
+ * This example transposes arbitrary-size matrices.  It compares a naive
+ * transpose kernel that suffers from non-coalesced writes, to an optimized
+ * transpose with fully coalesced memory access and no bank conflicts.  On 
+ * a G80 GPU, the optimized transpose can be more than 10x faster for large
+ * matrices.
+ */
+
+// standard utility and system includes
+#include "oclUtils.h"
+#include "shrQATest.h"
+
+#define BLOCK_DIM 16
+
+// max GPU's to manage for multi-GPU parallel compute
+const unsigned int MAX_GPU_COUNT = 8;
+
+// global variables
+cl_platform_id cpPlatform;
+cl_uint uiNumDevices;
+cl_device_id* cdDevices;
+cl_context cxGPUContext;
+cl_kernel ckKernel[MAX_GPU_COUNT];
+cl_command_queue commandQueue[MAX_GPU_COUNT];
+cl_program rv_program;
+
+// forward declarations
+// *********************************************************************
+int runTest( int argc, const char** argv);
+extern "C" void computeGold( float* reference, float* idata, 
+                         const unsigned int size_x, const unsigned int size_y );
+
+// Main Program
+// *********************************************************************
+int main( int argc, const char** argv) 
+{    
+    shrQAStart(argc, (char **)argv);
+
+    // set logfile name and start logs
+    shrSetLogFileName ("oclTranspose.txt");
+    shrLog("%s Starting...\n\n", argv[0]); 
+
+    // run the main test
+    int result = runTest(argc, argv);
+    //oclCheckError(result, 0);
+}
+
+double transposeGPU(const char* kernelName, bool useLocalMem,  cl_uint ciDeviceCount, float* h_idata, float* h_odata, unsigned int size_x, unsigned int size_y)
+{
+    cl_mem d_odata[MAX_GPU_COUNT];
+    cl_mem d_idata[MAX_GPU_COUNT];
+    cl_kernel ckKernel[MAX_GPU_COUNT];
+
+    size_t szGlobalWorkSize[2];
+    size_t szLocalWorkSize[2];
+    cl_int ciErrNum;
+ 
+    // Create buffers for each GPU
+    // Each GPU will compute sizePerGPU rows of the result
+    size_t sizePerGPU = shrRoundUp(BLOCK_DIM, (size_x+ciDeviceCount-1) / ciDeviceCount);
+    
+    // size of memory required to store the matrix
+    const size_t mem_size = sizeof(float) * size_x * size_y;
+
+    for(unsigned int i = 0; i < ciDeviceCount; ++i){
+        // allocate device memory and copy host to device memory
+        d_idata[i] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+                                    mem_size, h_idata, &ciErrNum);
+        //oclCheckError(ciErrNum, CL_SUCCESS);
+
+        // create buffer to store output
+        d_odata[i] = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY ,
+                                    sizePerGPU*size_y*sizeof(float), NULL, &ciErrNum);
+        //oclCheckError(ciErrNum, CL_SUCCESS);
+
+        // create the naive transpose kernel
+        ckKernel[i] = clCreateKernel(rv_program, kernelName, &ciErrNum);
+        //oclCheckError(ciErrNum, CL_SUCCESS);
+        
+        // set the args values for the naive kernel
+        size_t offset = i * sizePerGPU;
+        ciErrNum  = clSetKernelArg(ckKernel[i], 0, sizeof(cl_mem), (void *) &d_odata[i]);
+        ciErrNum |= clSetKernelArg(ckKernel[i], 1, sizeof(cl_mem), (void *) &d_idata[0]);
+        ciErrNum |= clSetKernelArg(ckKernel[i], 2, sizeof(int), &offset);
+        ciErrNum |= clSetKernelArg(ckKernel[i], 3, sizeof(int), &size_x);
+        ciErrNum |= clSetKernelArg(ckKernel[i], 4, sizeof(int), &size_y);
+        if(useLocalMem)
+        {
+            ciErrNum |= clSetKernelArg(ckKernel[i], 5, (BLOCK_DIM + 1) * BLOCK_DIM * sizeof(float), 0 );
+        }
+    }
+    //oclCheckError(ciErrNum, CL_SUCCESS);
+
+    // set up execution configuration
+    szLocalWorkSize[0] = BLOCK_DIM;
+    szLocalWorkSize[1] = BLOCK_DIM;
+    szGlobalWorkSize[0] = sizePerGPU;
+    szGlobalWorkSize[1] = shrRoundUp(BLOCK_DIM, size_y);
+    
+    // execute the kernel numIterations times
+    int numIterations = 100;
+    shrLog("\nProcessing a %d by %d matrix of floats...\n\n", size_x, size_y);
+    for (int i = -1; i < numIterations; ++i)
+    {
+        // Start time measurement after warmup
+        if( i == 0 ) shrDeltaT(0);
+
+        for(unsigned int k=0; k < ciDeviceCount; ++k){
+            ciErrNum |= clEnqueueNDRangeKernel(commandQueue[k], ckKernel[k], 2, NULL,                                           
+                                szGlobalWorkSize, szLocalWorkSize, 0, NULL, NULL);
+        }
+        //oclCheckError(ciErrNum, CL_SUCCESS);
+    }    
+
+    // Block CPU till GPU is done
+    for(unsigned int k=0; k < ciDeviceCount; ++k){ 
+        ciErrNum |= clFinish(commandQueue[k]);
+    }
+    double time = shrDeltaT(0)/(double)numIterations;
+    //oclCheckError(ciErrNum, CL_SUCCESS);
+
+    // Copy back to host
+    for(unsigned int i = 0; i < ciDeviceCount; ++i){
+        size_t offset = i * sizePerGPU;
+        size_t size = MIN(size_x - i * sizePerGPU, sizePerGPU);
+
+        ciErrNum |= clEnqueueReadBuffer(commandQueue[i], d_odata[i], CL_TRUE, 0,
+                                size * size_y * sizeof(float), &h_odata[offset * size_y], 
+                                0, NULL, NULL);
+    }
+    //oclCheckError(ciErrNum, CL_SUCCESS);
+
+    for(unsigned int i = 0; i < ciDeviceCount; ++i){
+        ciErrNum |= clReleaseMemObject(d_idata[i]);
+        ciErrNum |= clReleaseMemObject(d_odata[i]);
+        ciErrNum |= clReleaseKernel(ckKernel[i]);
+    }
+    //oclCheckError(ciErrNum, CL_SUCCESS);
+
+    return time;
+}
+uint8_t *kernel_bin = NULL;
+
+static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
+  if (nullptr == filename || nullptr == data || 0 == size)
+    return -1;
+
+  FILE* fp = fopen(filename, "r");
+  if (NULL == fp) {
+    fprintf(stderr, "Failed to load kernel.");
+    return -1;
+  }
+  fseek(fp , 0 , SEEK_END);
+  long fsize = ftell(fp);
+  rewind(fp);
+
+  *data = (uint8_t*)malloc(fsize);
+  *size = fread(*data, 1, fsize, fp);
+  
+  fclose(fp);
+  
+  return 0;
+}
+//! Run a simple test for CUDA
+// *********************************************************************
+int runTest( const int argc, const char** argv) 
+{
+    cl_int ciErrNum;
+    cl_uint ciDeviceCount;
+    unsigned int size_x = 2048;
+    unsigned int size_y = 2048;
+
+    int temp;
+    if( shrGetCmdLineArgumenti( argc, argv,"width", &temp) ){
+        size_x = temp;
+    }
+
+    if( shrGetCmdLineArgumenti( argc, argv,"height", &temp) ){
+        size_y = temp;
+    }
+
+    // size of memory required to store the matrix
+    const size_t mem_size = sizeof(float) * size_x * size_y;
+
+    //Get the NVIDIA platform
+    ciErrNum = oclGetPlatformID(&cpPlatform);
+    //oclCheckError(ciErrNum, CL_SUCCESS);
+
+    //Get the devices
+    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 0, NULL, &uiNumDevices);
+    //oclCheckError(ciErrNum, CL_SUCCESS);
+    cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );
+    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, uiNumDevices, cdDevices, NULL);
+    //oclCheckError(ciErrNum, CL_SUCCESS);
+
+    //Create the context
+    cxGPUContext = clCreateContext(0, uiNumDevices, cdDevices, NULL, NULL, &ciErrNum);
+    //oclCheckError(ciErrNum, CL_SUCCESS);
+  
+    if(shrCheckCmdLineFlag(argc, (const char**)argv, "device"))
+    {
+        ciDeviceCount = 0;
+        // User specified GPUs
+        char* deviceList;
+        char* deviceStr;
+
+        shrGetCmdLineArgumentstr(argc, (const char**)argv, "device", &deviceList);
+
+        #ifdef WIN32
+            char* next_token;
+            deviceStr = strtok_s (deviceList," ,.-", &next_token);
+        #else
+            deviceStr = strtok (deviceList," ,.-");
+        #endif   
+        ciDeviceCount = 0;
+        while(deviceStr != NULL) 
+        {
+            // get and print the device for this queue
+            cl_device_id device = oclGetDev(cxGPUContext, atoi(deviceStr));
+	    if( device == (cl_device_id)-1 ) {
+                shrLog(" Invalid Device: %s\n\n", deviceStr);
+                return -1;
+	    }	
+
+            shrLog("Device %d: ", atoi(deviceStr));
+            oclPrintDevName(LOGBOTH, device);            
+            shrLog("\n");
+           
+            // create command queue
+            commandQueue[ciDeviceCount] = clCreateCommandQueue(cxGPUContext, device, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
+            if (ciErrNum != CL_SUCCESS)
+            {
+                shrLog(" Error %i in clCreateCommandQueue call !!!\n\n", ciErrNum);
+                return ciErrNum;
+            }
+
+            ++ciDeviceCount;
+
+            #ifdef WIN32
+                deviceStr = strtok_s (NULL," ,.-", &next_token);
+            #else            
+                deviceStr = strtok (NULL," ,.-");
+            #endif
+        }
+
+        free(deviceList);
+    } 
+    else 
+    {
+        // Find out how many GPU's to compute on all available GPUs
+        size_t nDeviceBytes;
+        ciErrNum |= clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &nDeviceBytes);
+        ciDeviceCount = (cl_uint)nDeviceBytes/sizeof(cl_device_id);
+
+        if (ciErrNum != CL_SUCCESS)
+        {
+            shrLog(" Error %i in clGetDeviceIDs call !!!\n\n", ciErrNum);
+            return ciErrNum;
+        }
+        else if (ciDeviceCount == 0)
+        {
+            shrLog(" There are no devices supporting OpenCL (return code %i)\n\n", ciErrNum);
+            return -1;
+        } 
+
+        // create command-queues
+        for(unsigned int i = 0; i < ciDeviceCount; ++i) 
+        {
+            // get and print the device for this queue
+            cl_device_id device = oclGetDev(cxGPUContext, i);
+            shrLog("Device %d: ", i);
+            oclPrintDevName(LOGBOTH, device);            
+            shrLog("\n");
+
+            // create command queue
+            commandQueue[i] = clCreateCommandQueue(cxGPUContext, device, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
+            if (ciErrNum != CL_SUCCESS)
+            {
+                shrLog(" Error %i in clCreateCommandQueue call !!!\n\n", ciErrNum);
+                return ciErrNum;
+            }
+        }
+    }
+ 
+    // allocate and initalize host memory
+    float* h_idata = (float*)malloc(mem_size);
+    float* h_odata = (float*) malloc(mem_size);
+    srand(15235911);
+    shrFillArray(h_idata, (size_x * size_y));
+
+    // Program Setup
+    size_t program_length;
+    char* source_path = shrFindFilePath("transpose.cl", argv[0]);
+    //oclCheckError(source_path != NULL, shrTRUE);
+    char *source = oclLoadProgSource(source_path, "", &program_length);
+    //oclCheckError(source != NULL, shrTRUE);
+    size_t kernel_size;
+    cl_int binary_status = 0;
+    cl_device_id device_id;
+    // create the program
+    rv_program = clCreateProgramWithBinary(cxGPUContext, 1, &device_id, &kernel_size, &kernel_bin, &binary_status, NULL);
+    //rv_program = clCreateProgramWithSource(cxGPUContext, 1,
+                     // (const char **)&source, &program_length, &ciErrNum);
+    //oclCheckError(ciErrNum, CL_SUCCESS);
+    
+    // build the program
+    ciErrNum = clBuildProgram(rv_program, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL);
+    if (ciErrNum != CL_SUCCESS)
+    {
+        // write out standard error, Build Log and PTX, then return error
+        shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
+        oclLogBuildInfo(rv_program, oclGetFirstDev(cxGPUContext));
+        oclLogPtx(rv_program, oclGetFirstDev(cxGPUContext), "oclTranspose.ptx");
+        return(EXIT_FAILURE); 
+    }
+    
+    // Run Naive Kernel
+#ifdef GPU_PROFILING
+    // Matrix Copy kernel runs to measure reference performance.
+    double uncoalescedCopyTime = transposeGPU("uncoalesced_copy", false, ciDeviceCount, h_idata, h_odata, size_x, size_y);
+    double simpleCopyTime = transposeGPU("simple_copy", false, ciDeviceCount, h_idata, h_odata, size_x, size_y);
+    double sharedCopyTime = transposeGPU("shared_copy", true, ciDeviceCount, h_idata, h_odata, size_x, size_y);
+#endif
+
+    double naiveTime = transposeGPU("transpose_naive", false, ciDeviceCount, h_idata, h_odata, size_x, size_y);
+    double optimizedTime = transposeGPU("transpose", true, ciDeviceCount, h_idata, h_odata, size_x, size_y);
+
+#ifdef GPU_PROFILING
+    // log times
+
+    shrLogEx(LOGBOTH | MASTER, 0, "oclTranspose-Outer-simple copy, Throughput = %.4f GB/s, Time = %.5f s, Size = %u fp32 elements, NumDevsUsed = %u, Workgroup = %u\n", 
+           (1.0e-9 * double(size_x * size_y * sizeof(float))/simpleCopyTime), simpleCopyTime, (size_x * size_y), ciDeviceCount, BLOCK_DIM * BLOCK_DIM); 
+
+    shrLogEx(LOGBOTH | MASTER, 0, "oclTranspose-Outer-shared memory copy, Throughput = %.4f GB/s, Time = %.5f s, Size = %u fp32 elements, NumDevsUsed = %u, Workgroup = %u\n", 
+           (1.0e-9 * double(size_x * size_y * sizeof(float))/sharedCopyTime), sharedCopyTime, (size_x * size_y), ciDeviceCount, BLOCK_DIM * BLOCK_DIM); 
+
+    shrLogEx(LOGBOTH | MASTER, 0, "oclTranspose-Outer-uncoalesced copy, Throughput = %.4f GB/s, Time = %.5f s, Size = %u fp32 elements, NumDevsUsed = %u, Workgroup = %u\n", 
+           (1.0e-9 * double(size_x * size_y * sizeof(float))/uncoalescedCopyTime), uncoalescedCopyTime, (size_x * size_y), ciDeviceCount, BLOCK_DIM * BLOCK_DIM); 
+
+    shrLogEx(LOGBOTH | MASTER, 0, "oclTranspose-Outer-naive, Throughput = %.4f GB/s, Time = %.5f s, Size = %u fp32 elements, NumDevsUsed = %u, Workgroup = %u\n", 
+           (1.0e-9 * double(size_x * size_y * sizeof(float))/naiveTime), naiveTime, (size_x * size_y), ciDeviceCount, BLOCK_DIM * BLOCK_DIM); 
+    
+    shrLogEx(LOGBOTH | MASTER, 0, "oclTranspose-Outer-optimized, Throughput = %.4f GB/s, Time = %.5f s, Size = %u fp32 elements, NumDevsUsed = %u, Workgroup = %u\n", 
+          (1.0e-9 * double(size_x * size_y * sizeof(float))/optimizedTime), optimizedTime, (size_x * size_y), ciDeviceCount, BLOCK_DIM * BLOCK_DIM); 
+
+#endif
+  
+    // compute reference solution and cross check results
+    float* reference = (float*)malloc( mem_size);
+    computeGold( reference, h_idata, size_x, size_y);
+    shrLog("\nComparing results with CPU computation... \n\n");
+    shrBOOL res = shrComparef( reference, h_odata, size_x * size_y);
+
+    // cleanup memory
+    free(h_idata);
+    free(h_odata);
+    free(reference);
+    free(source);
+    free(source_path);
+
+    // cleanup OpenCL
+    ciErrNum = clReleaseProgram(rv_program);    
+    for(unsigned int i = 0; i < ciDeviceCount; ++i) 
+    {
+        ciErrNum |= clReleaseCommandQueue(commandQueue[i]);
+    }    
+    ciErrNum |= clReleaseContext(cxGPUContext);
+    //oclCheckError(ciErrNum, CL_SUCCESS);
+
+    // pass or fail (cumulative... all tests in the loop)
+    shrQAFinishExit(argc, (const char **)argv, (1 == res) ? QA_PASSED : QA_FAILED);
+
+    return 0;
+}
diff --git a/benchmarks/new_opencl/transpose/oclUtils.h b/benchmarks/new_opencl/transpose/oclUtils.h
new file mode 100644
index 00000000..096612a8
--- /dev/null
+++ b/benchmarks/new_opencl/transpose/oclUtils.h
@@ -0,0 +1,198 @@
+/*
+ * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+ 
+#ifndef OCL_UTILS_H
+#define OCL_UTILS_H
+
+// *********************************************************************
+// Utilities specific to OpenCL samples in NVIDIA GPU Computing SDK 
+// *********************************************************************
+
+// Common headers:  Cross-API utililties and OpenCL header
+#include "shrUtils.h"
+
+// All OpenCL headers
+#if defined (__APPLE__) || defined(MACOSX)
+    #include <OpenCL/opencl.h>
+#else
+    #include <CL/opencl.h>
+#endif 
+
+// Includes
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+// For systems with CL_EXT that are not updated with these extensions, we copied these
+// extensions from <CL/cl_ext.h>
+#ifndef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
+  /* cl_nv_device_attribute_query extension - no extension #define since it has no functions */
+  #define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV       0x4000
+  #define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV       0x4001
+  #define CL_DEVICE_REGISTERS_PER_BLOCK_NV            0x4002
+  #define CL_DEVICE_WARP_SIZE_NV                      0x4003
+  #define CL_DEVICE_GPU_OVERLAP_NV                    0x4004
+  #define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV            0x4005
+  #define CL_DEVICE_INTEGRATED_MEMORY_NV              0x4006
+#endif
+
+// reminders for build output window and log
+#ifdef _WIN32
+    #pragma message ("Note: including shrUtils.h")
+    #pragma message ("Note: including opencl.h")
+#endif
+
+// SDK Revision #
+#define OCL_SDKREVISION "7027912"
+
+// Error and Exit Handling Macros... 
+// *********************************************************************
+// Full error handling macro with Cleanup() callback (if supplied)... 
+// (Companion Inline Function lower on page)
+#define oclCheckErrorEX(a, b, c) __oclCheckErrorEX(a, b, c, __FILE__ , __LINE__) 
+
+// Short version without Cleanup() callback pointer
+// Both Input (a) and Reference (b) are specified as args
+#define oclCheckError(a, b) oclCheckErrorEX(a, b, 0) 
+
+//////////////////////////////////////////////////////////////////////////////
+//! Gets the platform ID for NVIDIA if available, otherwise default to platform 0
+//!
+//! @return the id 
+//! @param clSelectedPlatformID         OpenCL platform ID
+//////////////////////////////////////////////////////////////////////////////
+extern "C" cl_int oclGetPlatformID(cl_platform_id* clSelectedPlatformID);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Print info about the device
+//!
+//! @param iLogMode       enum LOGBOTH, LOGCONSOLE, LOGFILE
+//! @param device         OpenCL id of the device
+//////////////////////////////////////////////////////////////////////////////
+extern "C" void oclPrintDevInfo(int iLogMode, cl_device_id device);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Get and return device capability
+//!
+//! @return the 2 digit integer representation of device Cap (major minor). return -1 if NA 
+//! @param device         OpenCL id of the device
+//////////////////////////////////////////////////////////////////////////////
+extern "C" int oclGetDevCap(cl_device_id device);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Print the device name
+//!
+//! @param iLogMode       enum LOGBOTH, LOGCONSOLE, LOGFILE
+//! @param device         OpenCL id of the device
+//////////////////////////////////////////////////////////////////////////////
+extern "C" void oclPrintDevName(int iLogMode, cl_device_id device);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Gets the id of the first device from the context
+//!
+//! @return the id 
+//! @param cxGPUContext         OpenCL context
+//////////////////////////////////////////////////////////////////////////////
+extern "C" cl_device_id oclGetFirstDev(cl_context cxGPUContext);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Gets the id of the nth device from the context
+//!
+//! @return the id or -1 when out of range
+//! @param cxGPUContext         OpenCL context
+//! @param device_idx            index of the device of interest
+//////////////////////////////////////////////////////////////////////////////
+extern "C" cl_device_id oclGetDev(cl_context cxGPUContext, unsigned int device_idx);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Gets the id of device with maximal FLOPS from the context
+//!
+//! @return the id 
+//! @param cxGPUContext         OpenCL context
+//////////////////////////////////////////////////////////////////////////////
+extern "C" cl_device_id oclGetMaxFlopsDev(cl_context cxGPUContext);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Loads a Program file and prepends the cPreamble to the code.
+//!
+//! @return the source string if succeeded, 0 otherwise
+//! @param cFilename        program filename
+//! @param cPreamble        code that is prepended to the loaded file, typically a set of #defines or a header
+//! @param szFinalLength    returned length of the code string
+//////////////////////////////////////////////////////////////////////////////
+extern "C" char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Get the binary (PTX) of the program associated with the device
+//!
+//! @param cpProgram    OpenCL program
+//! @param cdDevice     device of interest
+//! @param binary       returned code
+//! @param length       length of returned code
+//////////////////////////////////////////////////////////////////////////////
+extern "C" void oclGetProgBinary( cl_program cpProgram, cl_device_id cdDevice, char** binary, size_t* length);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Get and log the binary (PTX) from the OpenCL compiler for the requested program & device
+//!
+//! @param cpProgram                   OpenCL program
+//! @param cdDevice                    device of interest
+//! @param const char*  cPtxFileName   optional PTX file name
+//////////////////////////////////////////////////////////////////////////////
+extern "C" void oclLogPtx(cl_program cpProgram, cl_device_id cdDevice, const char* cPtxFileName);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Get and log the Build Log from the OpenCL compiler for the requested program & device
+//!
+//! @param cpProgram    OpenCL program
+//! @param cdDevice     device of interest
+//////////////////////////////////////////////////////////////////////////////
+extern "C" void oclLogBuildInfo(cl_program cpProgram, cl_device_id cdDevice);
+
+// Helper function for De-allocating cl objects
+// *********************************************************************
+extern "C" void oclDeleteMemObjs(cl_mem* cmMemObjs, int iNumObjs);
+
+// Helper function to get OpenCL error string from constant
+// *********************************************************************
+extern "C" const char* oclErrorString(cl_int error);
+
+// Helper function to get OpenCL image format string (channel order and type) from constant
+// *********************************************************************
+extern "C" const char* oclImageFormatString(cl_uint uiImageFormat);
+
+// companion inline function for error checking and exit on error WITH Cleanup Callback (if supplied)
+// *********************************************************************
+inline void __oclCheckErrorEX(cl_int iSample, cl_int iReference, void (*pCleanup)(int), const char* cFile, const int iLine)
+{
+    // An error condition is defined by the sample/test value not equal to the reference
+    if (iReference != iSample)
+    {
+        // If the sample/test value isn't equal to the ref, it's an error by defnition, so override 0 sample/test value
+        iSample = (iSample == 0) ? -9999 : iSample; 
+
+        // Log the error info
+        shrLog("\n !!! Error # %i (%s) at line %i , in file %s !!!\n\n", iSample, oclErrorString(iSample), iLine, cFile);
+
+        // Cleanup and exit, or just exit if no cleanup function pointer provided.  Use iSample (error code in this case) as process exit code.
+        if (pCleanup != NULL)
+        {
+            pCleanup(iSample);
+        }
+        else 
+        {
+            shrLogEx(LOGBOTH | CLOSELOG, 0, "Exiting...\n");
+            exit(iSample);
+        }
+    }
+}
+
+#endif
\ No newline at end of file
diff --git a/benchmarks/new_opencl/transpose/shrQATest.h b/benchmarks/new_opencl/transpose/shrQATest.h
new file mode 100644
index 00000000..93d2d9eb
--- /dev/null
+++ b/benchmarks/new_opencl/transpose/shrQATest.h
@@ -0,0 +1,238 @@
+/*
+* Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+*
+* Please refer to the NVIDIA end user license agreement (EULA) associated
+* with this source code for terms and conditions that govern your use of
+* this software. Any use, reproduction, disclosure, or distribution of
+* this software and related documentation outside the terms of the EULA
+* is strictly prohibited.
+*
+*/
+
+#ifndef SHR_QATEST_H
+#define SHR_QATEST_H
+
+// *********************************************************************
+// Generic utilities for NVIDIA GPU Computing SDK 
+// *********************************************************************
+
+// OS dependent includes
+#ifdef _WIN32
+    #pragma message ("Note: including windows.h")
+    #pragma message ("Note: including math.h")
+    #pragma message ("Note: including assert.h")
+    #pragma message ("Note: including time.h")
+
+// Headers needed for Windows
+    #include <windows.h>
+	#include <time.h>
+#else
+    // Headers needed for Linux
+    #include <sys/stat.h>
+    #include <sys/types.h>
+    #include <sys/time.h>
+    #include <stdio.h>
+    #include <stdlib.h>
+    #include <string.h>
+    #include <stdarg.h>
+    #include <unistd.h>
+    #include <time.h>
+#endif
+
+#ifndef STRCASECMP
+#ifdef _WIN32
+#define STRCASECMP _stricmp
+#else
+#define STRCASECMP strcasecmp
+#endif
+#endif
+
+#ifndef STRNCASECMP
+#ifdef _WIN32
+#define STRNCASECMP _strnicmp
+#else
+#define STRNCASECMP strncasecmp
+#endif
+#endif
+
+
+// Standardized QA Start/Finish for CUDA SDK tests
+#define shrQAStart(a, b)      __shrQAStart(a, b)
+#define shrQAFinish(a, b, c)  __shrQAFinish(a, b, c)
+#define shrQAFinish2(a, b, c, d) __shrQAFinish2(a, b, c, d)
+
+inline int findExeNameStart(const char *exec_name)
+{
+    int exename_start = (int)strlen(exec_name);
+
+    while( (exename_start > 0) && 
+            (exec_name[exename_start] != '\\') && 
+            (exec_name[exename_start] != '/') )
+    {
+        exename_start--;
+    }
+    if (exec_name[exename_start] == '\\' || 
+        exec_name[exename_start] == '/')
+    {
+        return exename_start+1;
+    } else {
+        return exename_start;
+    }
+}
+
+inline int __shrQAStart(int argc, char **argv)
+{
+    bool bQATest = false;
+    // First clear the output buffer
+    fflush(stdout);
+    fflush(stdout);
+
+    for (int i=1; i < argc; i++) {
+        int string_start = 0;
+        while (argv[i][string_start] == '-')
+           string_start++;
+        char *string_argv = &argv[i][string_start];
+
+        if (!STRCASECMP(string_argv, "qatest")) {
+           bQATest = true;
+        }
+    }
+    
+    // We don't want to print the entire path, so we search for the first 
+    int exename_start = findExeNameStart(argv[0]);
+    if (bQATest) {
+        fprintf(stdout, "&&&& RUNNING %s", &(argv[0][exename_start]));
+        for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]);
+        fprintf(stdout, "\n");
+    } else {
+        fprintf(stdout, "[%s] starting...\n", &(argv[0][exename_start]));
+    }
+    fflush(stdout);
+    printf("\n"); fflush(stdout);
+    return exename_start;
+}
+
+enum eQAstatus {
+    QA_FAILED = 0,
+    QA_PASSED = 1,
+    QA_WAIVED = 2
+};
+
+inline void __ExitInTime(int seconds)
+{
+    fprintf(stdout, "> exiting in %d seconds: ", seconds);
+    fflush(stdout);
+    time_t t;
+    int count;
+    for (t=time(0)+seconds, count=seconds; time(0) < t; count--) {
+        fprintf(stdout, "%d...", count);
+#ifdef WIN32
+        Sleep(1000);
+#else
+        sleep(1);
+#endif
+    }
+    fprintf(stdout,"done!\n\n"); 
+	fflush(stdout);
+}
+
+
+inline void __shrQAFinish(int argc, const char **argv, int iStatus)
+{
+    // By default QATest is disabled and NoPrompt is Enabled (times out at seconds passed into __ExitInTime() )
+    bool bQATest = false, bNoPrompt = true, bQuitInTime = true;
+    const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL };
+	
+    for (int i=1; i < argc; i++) {
+        int string_start = 0;
+        while (argv[i][string_start] == '-')
+           string_start++;
+
+        const char *string_argv = &argv[i][string_start];
+        if (!STRCASECMP(string_argv, "qatest")) {
+           bQATest = true;
+        }	
+        // For SDK individual samples that don't specify -noprompt or -prompt, 
+        // a 3 second delay will happen before exiting, giving a user time to view results
+        if (!STRCASECMP(string_argv, "noprompt") || !STRCASECMP(string_argv, "help")) {
+            bNoPrompt = true;
+            bQuitInTime = false;
+        }
+        if (!STRCASECMP(string_argv, "prompt")) {
+            bNoPrompt = false;
+            bQuitInTime = false;
+        }
+    }
+
+    int exename_start = findExeNameStart(argv[0]);
+    if (bQATest) {
+        fprintf(stdout, "&&&& %s %s", sStatus[iStatus], &(argv[0][exename_start]));
+        for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]);
+        fprintf(stdout, "\n");
+    } else {
+        fprintf(stdout, "[%s] test results...\n%s\n", &(argv[0][exename_start]), sStatus[iStatus]);
+    }
+    fflush(stdout);
+    printf("\n"); fflush(stdout);
+    if (bQuitInTime) {
+        __ExitInTime(3);
+    } else {
+        if (!bNoPrompt) {
+            fprintf(stdout, "\nPress <Enter> to exit...\n");
+            fflush(stdout);
+            getchar();
+        }
+    }
+}
+
+inline void __shrQAFinish2(bool bQATest, int argc, const char **argv, int iStatus)
+{
+    bool bQuitInTime = true;
+    const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL };
+	
+    for (int i=1; i < argc; i++) {
+        int string_start = 0;
+        while (argv[i][string_start] == '-')
+           string_start++;
+
+        const char *string_argv = &argv[i][string_start];
+        // For SDK individual samples that don't specify -noprompt or -prompt, 
+        // a 3 second delay will happen before exiting, giving a user time to view results
+        if (!STRCASECMP(string_argv, "noprompt") || !STRCASECMP(string_argv, "help")) {
+            bQuitInTime = false;
+        }
+        if (!STRCASECMP(string_argv, "prompt")) {
+            bQuitInTime = false;
+        }
+    }
+
+    int exename_start = findExeNameStart(argv[0]);
+    if (bQATest) {
+        fprintf(stdout, "&&&& %s %s", sStatus[iStatus], &(argv[0][exename_start]));
+        for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]);
+        fprintf(stdout, "\n");
+    } else {
+        fprintf(stdout, "[%s] test results...\n%s\n", &(argv[0][exename_start]), sStatus[iStatus]);
+    }
+    fflush(stdout);
+    
+    if (bQuitInTime) {
+        __ExitInTime(3);
+    }
+}
+
+inline void shrQAFinishExit(int argc, const char **argv, int iStatus)
+{
+    __shrQAFinish(argc, argv, iStatus);
+
+    exit(iStatus ? EXIT_SUCCESS : EXIT_FAILURE); 
+}
+
+inline void shrQAFinishExit2(bool bQAtest, int argc, const char **argv, int iStatus)
+{
+    __shrQAFinish2(bQAtest, argc, argv, iStatus);
+
+    exit(iStatus ? EXIT_SUCCESS : EXIT_FAILURE);
+}
+
+#endif
\ No newline at end of file
diff --git a/benchmarks/new_opencl/transpose/shrUtils.h b/benchmarks/new_opencl/transpose/shrUtils.h
new file mode 100644
index 00000000..45ace670
--- /dev/null
+++ b/benchmarks/new_opencl/transpose/shrUtils.h
@@ -0,0 +1,642 @@
+/*
+* Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+*
+* Please refer to the NVIDIA end user license agreement (EULA) associated
+* with this source code for terms and conditions that govern your use of
+* this software. Any use, reproduction, disclosure, or distribution of
+* this software and related documentation outside the terms of the EULA
+* is strictly prohibited.
+*
+*/
+
+#ifndef SHR_UTILS_H
+#define SHR_UTILS_H
+
+// *********************************************************************
+// Generic utilities for NVIDIA GPU Computing SDK 
+// *********************************************************************
+
+// reminders for output window and build log
+#ifdef _WIN32
+    #pragma message ("Note: including windows.h")
+    #pragma message ("Note: including math.h")
+    #pragma message ("Note: including assert.h")
+#endif
+
+// OS dependent includes
+#ifdef _WIN32
+    // Headers needed for Windows
+    #include <windows.h>
+#else
+    // Headers needed for Linux
+    #include <sys/stat.h>
+    #include <sys/types.h>
+    #include <sys/time.h>
+    #include <stdio.h>
+    #include <stdlib.h>
+    #include <string.h>
+    #include <stdarg.h>
+#endif
+
+// Other headers needed for both Windows and Linux
+#include <math.h>
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+// Un-comment the following #define to enable profiling code in SDK apps
+//#define GPU_PROFILING
+
+// Beginning of GPU Architecture definitions
+inline int ConvertSMVer2Cores(int major, int minor)
+{
+	// Defines for GPU Architecture types (using the SM version to determine the # of cores per SM
+	typedef struct {
+		int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM minor version
+		int Cores;
+	} sSMtoCores;
+
+	sSMtoCores nGpuArchCoresPerSM[] = 
+	{ { 0x10,  8 }, // Tesla Generation (SM 1.0) G80 class
+	  { 0x11,  8 }, // Tesla Generation (SM 1.1) G8x class
+	  { 0x12,  8 }, // Tesla Generation (SM 1.2) G9x class
+	  { 0x13,  8 }, // Tesla Generation (SM 1.3) GT200 class
+	  { 0x20, 32 }, // Fermi Generation (SM 2.0) GF100 class
+	  { 0x21, 48 }, // Fermi Generation (SM 2.1) GF10x class
+	  { 0x30, 192}, // Fermi Generation (SM 3.0) GK10x class
+	  {   -1, -1 }
+	};
+
+	int index = 0;
+	while (nGpuArchCoresPerSM[index].SM != -1) {
+		if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor) ) {
+			return nGpuArchCoresPerSM[index].Cores;
+		}
+		index++;
+	}
+	printf("MapSMtoCores SM %d.%d is undefined (please update to the latest SDK)!\n", major, minor);
+	return -1;
+}
+// end of GPU Architecture definitions
+
+
+// Defines and enum for use with logging functions
+// *********************************************************************
+#define DEFAULTLOGFILE "SdkConsoleLog.txt"
+#define MASTERLOGFILE "SdkMasterLog.csv"
+enum LOGMODES 
+{
+    LOGCONSOLE = 1, // bit to signal "log to console" 
+    LOGFILE    = 2, // bit to signal "log to file" 
+    LOGBOTH    = 3, // convenience union of first 2 bits to signal "log to both"
+    APPENDMODE = 4, // bit to set "file append" mode instead of "replace mode" on open
+    MASTER     = 8, // bit to signal master .csv log output
+    ERRORMSG   = 16, // bit to signal "pre-pend Error" 
+    CLOSELOG   = 32  // bit to close log file, if open, after any requested file write
+};
+#define HDASHLINE "-----------------------------------------------------------\n"
+
+// Standardized boolean
+enum shrBOOL
+{
+    shrFALSE = 0,
+    shrTRUE = 1
+};
+
+// Standardized MAX, MIN and CLAMP
+#define MAX(a, b) ((a > b) ? a : b)
+#define MIN(a, b) ((a < b) ? a : b)
+#define CLAMP(a, b, c) MIN(MAX(a, b), c)    // double sided clip of input a
+#define TOPCLAMP(a, b) (a < b ? a:b)	    // single top side clip of input a
+
+// Error and Exit Handling Macros... 
+// *********************************************************************
+// Full error handling macro with Cleanup() callback (if supplied)... 
+// (Companion Inline Function lower on page)
+#define shrCheckErrorEX(a, b, c) __shrCheckErrorEX(a, b, c, __FILE__ , __LINE__) 
+
+// Short version without Cleanup() callback pointer
+// Both Input (a) and Reference (b) are specified as args
+#define shrCheckError(a, b) shrCheckErrorEX(a, b, 0) 
+
+// Standardized Exit Macro for leaving main()... extended version
+// (Companion Inline Function lower on page)
+#define shrExitEX(a, b, c) __shrExitEX(a, b, c)
+
+// Standardized Exit Macro for leaving main()... short version
+// (Companion Inline Function lower on page)
+#define shrEXIT(a, b)        __shrExitEX(a, b, EXIT_SUCCESS)
+
+// Simple argument checker macro
+#define ARGCHECK(a) if((a) != shrTRUE)return shrFALSE 
+
+// Define for user-customized error handling
+#define STDERROR "file %s, line %i\n\n" , __FILE__ , __LINE__
+
+// Function to deallocate memory allocated within shrUtils
+// *********************************************************************
+extern "C" void shrFree(void* ptr);
+
+// *********************************************************************
+// Helper function to log standardized information to Console, to File or to both
+//! Examples: shrLogEx(LOGBOTH, 0, "Function A\n"); 
+//!         : shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
+//! 
+//! Automatically opens file and stores handle if needed and not done yet
+//! Closes file and nulls handle on request
+//! 
+//! @param 0 iLogMode: LOGCONSOLE, LOGFILE, LOGBOTH, APPENDMODE, MASTER, ERRORMSG, CLOSELOG.  
+//!          LOGFILE and LOGBOTH may be | 'd  with APPENDMODE to select file append mode instead of overwrite mode 
+//!          LOGFILE and LOGBOTH may be | 'd  with CLOSELOG to "write and close" 
+//!          First 3 options may be | 'd  with MASTER to enable independent write to master data log file
+//!          First 3 options may be | 'd  with ERRORMSG to start line with standard error message
+//! @param 2 dValue:    
+//!          Positive val = double value for time in secs to be formatted to 6 decimals. 
+//!          Negative val is an error code and this give error preformatting.
+//! @param 3 cFormatString: String with formatting specifiers like printf or fprintf.  
+//!          ALL printf flags, width, precision and type specifiers are supported with this exception: 
+//!              Wide char type specifiers intended for wprintf (%S and %C) are NOT supported
+//!              Single byte char type specifiers (%s and %c) ARE supported 
+//! @param 4... variable args: like printf or fprintf.  Must match format specifer type above.  
+//! @return 0 if OK, negative value on error or if error occurs or was passed in. 
+// *********************************************************************
+extern "C" int shrLogEx(int iLogMode, int iErrNum, const char* cFormatString, ...);
+
+// Short version of shrLogEx defaulting to shrLogEx(LOGBOTH, 0, 
+// *********************************************************************
+extern "C" int shrLog(const char* cFormatString, ...);
+
+// *********************************************************************
+// Delta timer function for up to 3 independent timers using host high performance counters 
+// Maintains state for 3 independent counters
+//! Example: double dElapsedTime = shrDeltaTime(0);
+//! 
+//! @param 0 iCounterID: Which timer to check/reset. (0, 1, 2)
+//! @return delta time of specified counter since last call in seconds.  Otherwise -9999.0 if error
+// *********************************************************************
+extern "C" double shrDeltaT(int iCounterID);
+
+// Optional LogFileNameOverride function
+// *********************************************************************
+extern "C" void shrSetLogFileName (const char* cOverRideName);
+
+// Helper function to init data arrays 
+// *********************************************************************
+extern "C" void shrFillArray(float* pfData, int iSize);
+
+// Helper function to print data arrays 
+// *********************************************************************
+extern "C" void shrPrintArray(float* pfData, int iSize);
+
+////////////////////////////////////////////////////////////////////////////
+//! Find the path for a filename
+//! @return the path if succeeded, otherwise 0
+//! @param filename        name of the file
+//! @param executablePath  optional absolute path of the executable
+////////////////////////////////////////////////////////////////////////////
+extern "C" char* shrFindFilePath(const char* filename, const char* executablePath);
+
+////////////////////////////////////////////////////////////////////////////
+//! Read file \filename containing single precision floating point data
+//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
+//! @param filename name of the source file
+//! @param data  uninitialized pointer, returned initialized and pointing to
+//!        the data read
+//! @param len  number of data elements in data, -1 on error
+//! @note If a NULL pointer is passed to this function and it is initialized 
+//!       within shrUtils, then free() has to be used to deallocate the memory
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrReadFilef( const char* filename, float** data, unsigned int* len, 
+              bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Read file \filename containing double precision floating point data
+//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
+//! @param filename name of the source file
+//! @param data  uninitialized pointer, returned initialized and pointing to
+//!        the data read
+//! @param len  number of data elements in data, -1 on error
+//! @note If a NULL pointer is passed to this function and it is
+//! @note If a NULL pointer is passed to this function and it is initialized 
+//!       within shrUtils, then free() has to be used to deallocate the memory
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrReadFiled( const char* filename, double** data, unsigned int* len, 
+              bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Read file \filename containing integer data
+//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
+//! @param filename name of the source file
+//! @param data  uninitialized pointer, returned initialized and pointing to
+//!        the data read
+//! @param len  number of data elements in data, -1 on error
+//! @note If a NULL pointer is passed to this function and it is
+//! @note If a NULL pointer is passed to this function and it is initialized 
+//!       within shrUtils, then free() has to be used to deallocate the memory
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrReadFilei( const char* filename, int** data, unsigned int* len, bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Read file \filename containing unsigned integer data
+//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
+//! @param filename name of the source file
+//! @param data  uninitialized pointer, returned initialized and pointing to
+//!        the data read
+//! @param len  number of data elements in data, -1 on error
+//! @note If a NULL pointer is passed to this function and it is 
+//! @note If a NULL pointer is passed to this function and it is initialized 
+//!       within shrUtils, then free() has to be used to deallocate the memory
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrReadFileui( const char* filename, unsigned int** data, 
+               unsigned int* len, bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Read file \filename containing char / byte data
+//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
+//! @param filename name of the source file
+//! @param data  uninitialized pointer, returned initialized and pointing to
+//!        the data read
+//! @param len  number of data elements in data, -1 on error
+//! @note If a NULL pointer is passed to this function and it is 
+//! @note If a NULL pointer is passed to this function and it is initialized 
+//!       within shrUtils, then free() has to be used to deallocate the memory
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrReadFileb( const char* filename, char** data, unsigned int* len, 
+              bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Read file \filename containing unsigned char / byte data
+//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
+//! @param filename name of the source file
+//! @param data  uninitialized pointer, returned initialized and pointing to
+//!        the data read
+//! @param len  number of data elements in data, -1 on error
+//! @note If a NULL pointer is passed to this function and it is
+//! @note If a NULL pointer is passed to this function and it is initialized 
+//!       within shrUtils, then free() has to be used to deallocate the memory
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrReadFileub( const char* filename, unsigned char** data, 
+               unsigned int* len, bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Write a data file \filename containing single precision floating point 
+//! data
+//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
+//! @param filename name of the file to write
+//! @param data  pointer to data to write
+//! @param len  number of data elements in data, -1 on error
+//! @param epsilon  epsilon for comparison
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrWriteFilef( const char* filename, const float* data, unsigned int len,
+               const float epsilon, bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Write a data file \filename containing double precision floating point 
+//! data
+//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
+//! @param filename name of the file to write
+//! @param data  pointer to data to write
+//! @param len  number of data elements in data, -1 on error
+//! @param epsilon  epsilon for comparison
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrWriteFiled( const char* filename, const float* data, unsigned int len,
+               const double epsilon, bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Write a data file \filename containing integer data
+//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
+//! @param filename name of the file to write
+//! @param data  pointer to data to write
+//! @param len  number of data elements in data, -1 on error
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrWriteFilei( const char* filename, const int* data, unsigned int len,
+               bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Write a data file \filename containing unsigned integer data
+//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
+//! @param filename name of the file to write
+//! @param data  pointer to data to write
+//! @param len  number of data elements in data, -1 on error
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrWriteFileui( const char* filename, const unsigned int* data, 
+                unsigned int len, bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Write a data file \filename containing char / byte data
+//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
+//! @param filename name of the file to write
+//! @param data  pointer to data to write
+//! @param len  number of data elements in data, -1 on error
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrWriteFileb( const char* filename, const char* data, unsigned int len, 
+               bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Write a data file \filename containing unsigned char / byte data
+//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
+//! @param filename name of the file to write
+//! @param data  pointer to data to write
+//! @param len  number of data elements in data, -1 on error
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrWriteFileub( const char* filename, const unsigned char* data,
+                unsigned int len, bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Load PPM image file (with unsigned char as data element type), padding 
+//! 4th component
+//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
+//! @param file  name of the image file
+//! @param OutData  handle to the data read
+//! @param w     width of the image
+//! @param h     height of the image
+//! 
+//! Note: If *OutData is NULL this function allocates buffer that must be freed by caller
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrLoadPPM4ub(const char* file, unsigned char** OutData, 
+                             unsigned int *w, unsigned int *h);
+
+////////////////////////////////////////////////////////////////////////////
+//! Save PPM image file (with unsigned char as data element type, padded to 
+//! 4 bytes)
+//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE
+//! @param file  name of the image file
+//! @param data  handle to the data read
+//! @param w     width of the image
+//! @param h     height of the image
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrSavePPM4ub( const char* file, unsigned char *data, 
+               unsigned int w, unsigned int h);
+
+////////////////////////////////////////////////////////////////////////////////
+//! Save PGM image file (with unsigned char as data element type)
+//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE
+//! @param file  name of the image file
+//! @param data  handle to the data read
+//! @param w     width of the image
+//! @param h     height of the image
+////////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrSavePGMub( const char* file, unsigned char *data, 
+              unsigned int w, unsigned int h); 
+
+////////////////////////////////////////////////////////////////////////////
+//! Load PGM image file (with unsigned char as data element type)
+//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE
+//! @param file  name of the image file
+//! @param data  handle to the data read
+//! @param w     width of the image
+//! @param h     height of the image
+//! @note If a NULL pointer is passed to this function and it is initialized 
+//!       within shrUtils, then free() has to be used to deallocate the memory
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrLoadPGMub( const char* file, unsigned char** data,
+                  unsigned int *w,unsigned int *h);
+
+////////////////////////////////////////////////////////////////////////////
+// Command line arguments: General notes
+// * All command line arguments begin with '--' followed by the token; 
+//   token and value are seperated by '='; example --samples=50
+// * Arrays have the form --model=[one.obj,two.obj,three.obj] 
+//   (without whitespaces)
+////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////////////
+//! Check if command line argument \a flag-name is given
+//! @return shrTRUE if command line argument \a flag_name has been given, 
+//!         otherwise shrFALSE
+//! @param argc  argc as passed to main()
+//! @param argv  argv as passed to main()
+//! @param flag_name  name of command line flag
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrCheckCmdLineFlag( const int argc, const char** argv, 
+                     const char* flag_name);
+
+////////////////////////////////////////////////////////////////////////////
+//! Get the value of a command line argument of type int
+//! @return shrTRUE if command line argument \a arg_name has been given and
+//!         is of the requested type, otherwise shrFALSE
+//! @param argc  argc as passed to main()
+//! @param argv  argv as passed to main()
+//! @param arg_name  name of the command line argument
+//! @param val  value of the command line argument
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrGetCmdLineArgumenti( const int argc, const char** argv, 
+                        const char* arg_name, int* val);
+
+////////////////////////////////////////////////////////////////////////////
+//! Get the value of a command line argument of type unsigned int
+//! @return shrTRUE if command line argument \a arg_name has been given and
+//!         is of the requested type, otherwise shrFALSE
+//! @param argc  argc as passed to main()
+//! @param argv  argv as passed to main()
+//! @param arg_name  name of the command line argument
+//! @param val  value of the command line argument
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrGetCmdLineArgumentu( const int argc, const char** argv, 
+                        const char* arg_name, unsigned int* val);
+
+////////////////////////////////////////////////////////////////////////////
+//! Get the value of a command line argument of type float
+//! @return shrTRUE if command line argument \a arg_name has been given and
+//!         is of the requested type, otherwise shrFALSE
+//! @param argc  argc as passed to main()
+//! @param argv  argv as passed to main()
+//! @param arg_name  name of the command line argument
+//! @param val  value of the command line argument
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrGetCmdLineArgumentf( const int argc, const char** argv, 
+                        const char* arg_name, float* val);
+
+////////////////////////////////////////////////////////////////////////////
+//! Get the value of a command line argument of type string
+//! @return shrTRUE if command line argument \a arg_name has been given and
+//!         is of the requested type, otherwise shrFALSE
+//! @param argc  argc as passed to main()
+//! @param argv  argv as passed to main()
+//! @param arg_name  name of the command line argument
+//! @param val  value of the command line argument
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrGetCmdLineArgumentstr( const int argc, const char** argv, 
+                          const char* arg_name, char** val);
+
+////////////////////////////////////////////////////////////////////////////
+//! Get the value of a command line argument list those element are strings
+//! @return shrTRUE if command line argument \a arg_name has been given and
+//!         is of the requested type, otherwise shrFALSE
+//! @param argc  argc as passed to main()
+//! @param argv  argv as passed to main()
+//! @param arg_name  name of the command line argument
+//! @param val  command line argument list
+//! @param len  length of the list / number of elements
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrGetCmdLineArgumentListstr( const int argc, const char** argv, 
+                              const char* arg_name, char** val, 
+                              unsigned int* len);
+
+////////////////////////////////////////////////////////////////////////////
+//! Compare two float arrays
+//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrComparef( const float* reference, const float* data,
+             const unsigned int len);
+
+////////////////////////////////////////////////////////////////////////////
+//! Compare two integer arrays
+//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrComparei( const int* reference, const int* data, 
+             const unsigned int len ); 
+
+////////////////////////////////////////////////////////////////////////////////
+//! Compare two unsigned integer arrays, with epsilon and threshold
+//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+//! @param threshold  tolerance % # of comparison errors (0.15f = 15%)
+////////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrCompareuit( const unsigned int* reference, const unsigned int* data,
+            const unsigned int len, const float epsilon, const float threshold );
+
+////////////////////////////////////////////////////////////////////////////
+//! Compare two unsigned char arrays
+//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrCompareub( const unsigned char* reference, const unsigned char* data,
+              const unsigned int len ); 
+
+////////////////////////////////////////////////////////////////////////////////
+//! Compare two integers with a tolernance for # of byte errors
+//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+//! @param epsilon    epsilon to use for the comparison
+//! @param threshold  tolerance % # of comparison errors (0.15f = 15%)
+////////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrCompareubt( const unsigned char* reference, const unsigned char* data,
+             const unsigned int len, const float epsilon, const float threshold );
+
+////////////////////////////////////////////////////////////////////////////////
+//! Compare two integer arrays witha n epsilon tolerance for equality
+//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+//! @param epsilon    epsilon to use for the comparison
+////////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrCompareube( const unsigned char* reference, const unsigned char* data,
+             const unsigned int len, const float epsilon );
+
+////////////////////////////////////////////////////////////////////////////
+//! Compare two float arrays with an epsilon tolerance for equality
+//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+//! @param epsilon    epsilon to use for the comparison
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrComparefe( const float* reference, const float* data,
+              const unsigned int len, const float epsilon );
+
+////////////////////////////////////////////////////////////////////////////////
+//! Compare two float arrays with an epsilon tolerance for equality and a 
+//!     threshold for # pixel errors
+//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+//! @param epsilon    epsilon to use for the comparison
+////////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrComparefet( const float* reference, const float* data,
+             const unsigned int len, const float epsilon, const float threshold );
+
+////////////////////////////////////////////////////////////////////////////
+//! Compare two float arrays using L2-norm with an epsilon tolerance for 
+//! equality
+//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+//! @param epsilon    epsilon to use for the comparison
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrCompareL2fe( const float* reference, const float* data,
+                const unsigned int len, const float epsilon );
+
+////////////////////////////////////////////////////////////////////////////////
+//! Compare two PPM image files with an epsilon tolerance for equality
+//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
+//! @param src_file   filename for the image to be compared
+//! @param data       filename for the reference data / gold image
+//! @param epsilon    epsilon to use for the comparison
+//! @param threshold  threshold of pixels that can still mismatch to pass (i.e. 0.15f = 15% must pass)
+//! $param verboseErrors output details of image mismatch to std::err
+////////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrComparePPM( const char *src_file, const char *ref_file, const float epsilon, const float threshold);
+
+////////////////////////////////////////////////////////////////////////////////
+//! Compare two PGM image files with an epsilon tolerance for equality
+//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
+//! @param src_file   filename for the image to be compared
+//! @param data       filename for the reference data / gold image
+//! @param epsilon    epsilon to use for the comparison
+//! @param threshold  threshold of pixels that can still mismatch to pass (i.e. 0.15f = 15% must pass)
+//! $param verboseErrors output details of image mismatch to std::err
+////////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrComparePGM( const char *src_file, const char *ref_file, const float epsilon, const float threshold);
+
+extern "C" unsigned char* shrLoadRawFile(const char* filename, size_t size);
+
+extern "C" size_t shrRoundUp(int group_size, int global_size);
+
+// companion inline function for error checking and exit on error WITH Cleanup Callback (if supplied)
+// *********************************************************************
+inline void __shrCheckErrorEX(int iSample, int iReference, void (*pCleanup)(int), const char* cFile, const int iLine)
+{
+    if (iReference != iSample)
+    {
+        shrLogEx(LOGBOTH | ERRORMSG, iSample, "line %i , in file %s !!!\n\n" , iLine, cFile); 
+        if (pCleanup != NULL)
+        {
+            pCleanup(EXIT_FAILURE);
+        }
+        else 
+        {
+            shrLogEx(LOGBOTH | CLOSELOG, 0, "Exiting...\n");
+            exit(EXIT_FAILURE);
+        }
+    }
+}
+
+// Standardized Exit
+// *********************************************************************
+inline void __shrExitEX(int argc, const char** argv, int iExitCode)
+{
+#ifdef WIN32
+    if (!shrCheckCmdLineFlag(argc, argv, "noprompt") && !shrCheckCmdLineFlag(argc, argv, "qatest")) 
+#else 
+    if (shrCheckCmdLineFlag(argc, argv, "prompt") && !shrCheckCmdLineFlag(argc, argv, "qatest")) 
+#endif
+    {
+        shrLogEx(LOGBOTH | CLOSELOG, 0, "\nPress <Enter> to Quit...\n");                  
+        getchar();                                                           
+    }       
+    else 
+    {
+        shrLogEx(LOGBOTH | CLOSELOG, 0, "%s Exiting...\n", argv[0]); 
+    }
+    fflush(stderr);                                                         
+    exit(iExitCode);
+}
+
+#endif
\ No newline at end of file
diff --git a/benchmarks/new_opencl/transpose/transpose.cl b/benchmarks/new_opencl/transpose/transpose.cl
new file mode 100644
index 00000000..c0dd6e6b
--- /dev/null
+++ b/benchmarks/new_opencl/transpose/transpose.cl
@@ -0,0 +1,108 @@
+/*
+ * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+
+/* Matrix transpose with OpenCL
+* Device code.
+*/
+
+#define BLOCK_DIM 16
+
+// This kernel is optimized to ensure all global reads and writes are coalesced,
+// and to avoid bank conflicts in shared memory.  This kernel is up to 11x faster
+// than the naive kernel below.  Note that the shared memory array is sized to 
+// (BLOCK_DIM+1)*BLOCK_DIM.  This pads each row of the 2D block in shared memory 
+// so that bank conflicts do not occur when threads address the array column-wise.
+__kernel void transpose(__global float *odata, __global float *idata, int offset, int width, int height, __local float* block)
+{
+	// read the matrix tile into shared memory
+	unsigned int xIndex = get_global_id(0);
+	unsigned int yIndex = get_global_id(1);
+
+	if((xIndex + offset < width) && (yIndex < height))
+	{
+		unsigned int index_in = yIndex * width + xIndex + offset;
+		block[get_local_id(1)*(BLOCK_DIM+1)+get_local_id(0)] = idata[index_in];
+	}
+
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	// write the transposed matrix tile to global memory
+	xIndex = get_group_id(1) * BLOCK_DIM + get_local_id(0);
+	yIndex = get_group_id(0) * BLOCK_DIM + get_local_id(1);
+	if((xIndex < height) && (yIndex + offset < width))
+    {
+		unsigned int index_out = yIndex * height + xIndex;
+		odata[index_out] = block[get_local_id(0)*(BLOCK_DIM+1)+get_local_id(1)];
+	}
+}
+
+
+
+// This naive transpose kernel suffers from completely non-coalesced writes.
+// It can be up to 10x slower than the kernel above for large matrices.
+__kernel void transpose_naive(__global float *odata, __global float* idata, int offset, int width, int height)
+{
+    unsigned int xIndex = get_global_id(0);
+    unsigned int yIndex = get_global_id(1);
+    
+    if (xIndex + offset < width && yIndex < height)
+    {
+        unsigned int index_in  = xIndex + offset + width * yIndex;
+        unsigned int index_out = yIndex + height * xIndex;
+        odata[index_out] = idata[index_in]; 
+    }
+}
+
+
+__kernel void simple_copy(__global float *odata, __global float* idata, int offset, int width, int height)
+{
+    unsigned int xIndex = get_global_id(0);
+    unsigned int yIndex = get_global_id(1);
+    
+    if (xIndex + offset < width && yIndex < height)
+    {
+        unsigned int index_in  = xIndex + offset + width * yIndex;
+        odata[index_in] = idata[index_in]; 
+    }
+}
+
+__kernel void shared_copy(__global float *odata, __global float *idata, int offset, int width, int height, __local float* block)
+{
+	// read the matrix tile into shared memory
+	unsigned int xIndex = get_global_id(0);
+	unsigned int yIndex = get_global_id(1);
+
+    unsigned int index_in = yIndex * width + xIndex + offset;
+	if((xIndex + offset< width) && (yIndex < height))
+	{
+		block[get_local_id(1)*(BLOCK_DIM+1)+get_local_id(0)] = idata[index_in];
+	}
+
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	if((xIndex < height) && (yIndex+ offset < width))
+    {
+		odata[index_in] = block[get_local_id(1)*(BLOCK_DIM+1)+get_local_id(0)];
+	}
+}
+
+
+__kernel void uncoalesced_copy(__global float *odata, __global float* idata, int offset, int width, int height)
+{
+    unsigned int xIndex = get_global_id(0);
+    unsigned int yIndex = get_global_id(1);
+    
+    if (xIndex + offset < width && yIndex < height)
+    {
+        unsigned int index_in  = yIndex + height * (xIndex+ offset);
+        odata[index_in] = idata[index_in]; 
+    }
+}
diff --git a/benchmarks/new_opencl/transpose/transpose_gold.cpp b/benchmarks/new_opencl/transpose/transpose_gold.cpp
new file mode 100644
index 00000000..db7fbee5
--- /dev/null
+++ b/benchmarks/new_opencl/transpose/transpose_gold.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+
+/* Small Matrix transpose with Cuda (Example for a 16x16 matrix)
+* Reference solution.
+*/
+
+////////////////////////////////////////////////////////////////////////////////
+// export C interface
+extern "C" 
+void computeGold( float* reference, float* idata, 
+                  const unsigned int size_x, const unsigned int size_y );
+
+////////////////////////////////////////////////////////////////////////////////
+//! Compute reference data set
+////////////////////////////////////////////////////////////////////////////////
+void
+computeGold( float* reference, float* idata, 
+            const unsigned int size_x, const unsigned int size_y ) 
+{
+    // transpose matrix
+    for( unsigned int y = 0; y < size_y; ++y) 
+    {
+        for( unsigned int x = 0; x < size_x; ++x) 
+        {
+            reference[(x * size_y) + y] = idata[(y * size_x) + x];
+        }
+    }  
+}
+