From 6cbfbfb856e7759de035639b4d5dba17e82f0c0f Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Sun, 14 Apr 2024 21:25:14 -0700 Subject: [PATCH 1/4] sgemm_wg: Output CPU data to binary --- kernel/src/vx_spawn.c | 1 - tests/regression/sgemm_wg/main.cpp | 8 ++++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/kernel/src/vx_spawn.c b/kernel/src/vx_spawn.c index 9ea45ded..ffbbaccb 100644 --- a/kernel/src/vx_spawn.c +++ b/kernel/src/vx_spawn.c @@ -258,7 +258,6 @@ void vx_spawn_tasks_cluster(int num_tasks, vx_spawn_tasks_cb callback, void *arg // threads, handle this in the last wave amongst other full warps. if (rem_threads_in_last_warp != 0 && core_id_in_cluster == 0) { // adjust offset - // FIXME: consider cluster_id here // FIXME: use rem_threads_in_last_warp_this_core wspawn_args.offset += (num_tasks_this_cluster - rem_threads_in_last_warp); diff --git a/tests/regression/sgemm_wg/main.cpp b/tests/regression/sgemm_wg/main.cpp index 93152896..709d804c 100644 --- a/tests/regression/sgemm_wg/main.cpp +++ b/tests/regression/sgemm_wg/main.cpp @@ -119,6 +119,14 @@ int run_test(const kernel_arg_t& kernel_arg, file.write(reinterpret_cast(staging_buf.data()), buf_size); file.close(); + std::ofstream ref_file("reference.c.bin", std::ios::binary | std::ios::out); + if (!ref_file) { + std::cerr << "error: failed to open reference.c.bin for writing\n"; + exit(EXIT_FAILURE); + } + ref_file.write(reinterpret_cast(ref_data.data()), buf_size); + ref_file.close(); + // verify result std::cout << "verify result" << std::endl; { From 689043b45e9edcb8808c2962a08dd3ac681a977a Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Tue, 16 Apr 2024 15:22:03 -0700 Subject: [PATCH 2/4] Add regression flops --- tests/regression/flops/.gitignore | 5 + tests/regression/flops/Makefile | 9 ++ tests/regression/flops/common.h | 15 ++ tests/regression/flops/flops | Bin 0 -> 37976 bytes tests/regression/flops/kernel.cpp | 41 +++++ tests/regression/flops/main.cpp | 252 ++++++++++++++++++++++++++++++ 6 files changed, 322 insertions(+) create mode 100644 tests/regression/flops/.gitignore create mode 100644 tests/regression/flops/Makefile create mode 100644 tests/regression/flops/common.h create mode 100755 tests/regression/flops/flops create mode 100644 tests/regression/flops/kernel.cpp create mode 100644 tests/regression/flops/main.cpp diff --git a/tests/regression/flops/.gitignore b/tests/regression/flops/.gitignore new file mode 100644 index 00000000..c791df5d --- /dev/null +++ b/tests/regression/flops/.gitignore @@ -0,0 +1,5 @@ +*.bin +*.dump +*.elf +flops +.depend diff --git a/tests/regression/flops/Makefile b/tests/regression/flops/Makefile new file mode 100644 index 00000000..b5d37285 --- /dev/null +++ b/tests/regression/flops/Makefile @@ -0,0 +1,9 @@ +PROJECT = flops + +SRCS = main.cpp common.h + +VX_SRCS = kernel.cpp + +OPTS ?= -n16 + +include ../common.mk diff --git a/tests/regression/flops/common.h b/tests/regression/flops/common.h new file mode 100644 index 00000000..a609a0b4 --- /dev/null +++ b/tests/regression/flops/common.h @@ -0,0 +1,15 @@ +#ifndef _COMMON_H_ +#define _COMMON_H_ + +#include + +#define KERNEL_ARG_DEV_MEM_ADDR 0x7fff0000 +#define DEV_SMEM_START_ADDR 0xff000000 + +typedef struct { + uint32_t size; + uint32_t addr_src; + uint32_t addr_dst; +} kernel_arg_t; + +#endif diff --git a/tests/regression/flops/flops b/tests/regression/flops/flops new file mode 100755 index 0000000000000000000000000000000000000000..dfd6a6c8f97beb7149347256f244d09478eb282a GIT binary patch literal 37976 zcmeHw3v^V~x&NLd1Cd82DuGx;1`SFI!~_UUsF4IRa7HH>NkDwiVVF!vN|K2)6M6iL z1_Rb{n2OfhdaL%2y|!{&+tS;|RjeAS80__dZS|s6q&`nT1cM@4b^gC^KTggZ<|w!8 zuK!y1uAH@IzWsfBf6x8xefBwL=Ik3hz6Dth2UABjyORj|?c zp2Xa281NiU6ZO>spjI6FveIF$q-O&XUm+DTz_SIJDX1(YNPKoj#aW_8L6$FY;&W2b zO9JZ2-O!G`|NHg;ZbLqE?Z&hOjA zbA^qZTW##`v!T~xAY}6QWgB|4jov*rak$^cFNcjiH`&NPVH1aQZR8xYp|7!#|AvkI zGi~Jj%qBk9+t4d)=$F~}^}bDh{?5k!vu)_BY~p{0jr@QOJrD75<6}MM*ywGxiQA(# z?Y|Gfney&c7zm8Y2w6{$N`;3e8t7y0jry9*nK)40(f|il(rCO>0xA!xN-- z+#tme?hJL1OVLG%PE{3p~LMwzJ6z$$de`BfN(=Z#sh6~|J zf1SUzBN~dtyzL&(;)Zaes4(~_jw`#GGm3}B2&J;eB&umpRO3?EP%dl`CfG?6K{6bj zt7TtG(|(0D^o+|L8eH3Q6kk}+A!1K#+?k$?CgqR1X%_b z46hTT3p!e3Dq+gkL|S7Z@=b7`TG-Q)j>ZZN3k)lA6~oBLm~eS>TUWG24$Zcx7tUIT zTn4v-jTjx}elch}*0jrje#zM7-p9VA(JbzKJ?zLZ9?JY4A;icNJ^<#6x#biGfNYJK zXqL*ta97M9ZuUn49jikA=JpsITMtE)xu_Bv8EMui$l?__7F|FK{?<0J0q# zdMu1o;)5D9=UUA9?VW=Sbq==08-zib)Gvr1;9qh)1jwUrF*YUX+UWhFDs6{75)Q8LRYl?@hW zl$0A}=)g%e3)gnpxGtn`C;wJ=GKK-8W4NppZKxfc<%hA;BwcZ5vk{zs80bzz)=1E> zc;LriOI3169S&ScPQzM)u3g>u)Stf&cQ8h>y`a(IfV{r|QvC{iI!}BbD~SGPL_j(1<8mN-wr3TnM*D-9I(+V+oGC{X>A?N;OBVR z{*K$ehuYz6vc$W-(s~-jsZ_~3i|{OVk(8(ItElx*XDLaA-Euvo=-WyJeXWIFD%S6aA=( zKGsA(VWN*S(U}|<)F*`|daj8sZw>R>d=vdEruKy<`gjxFZKA7t(o{XgM4w=4UuvSC zW1?4>=o3x!Y7^aUqH8AlBon>fL_gO=UuL48XQHn#(Thy<788B4iQZ|VpKqeCHPJ6H z(YsCbDJJ^OCVH`nzSTshYjbtnX`)Y6M8It(`ZN=LyNNERcwW21M3;{;IQ>Brz04%1 z*F>LTqVF-$=`22U3`Jll0z(lPioj3={x?M6nCqNRw4Qf!wS@B}KV#bF-dI*@uh#QK z?&I9ORQVa-U~C{Y`Ah)UL`LmfsAi!5Z>dyjE5~VJKG6TXfzyI~pnsQv(?Wcp|5pZ1 z3-E#ddkvfx-UI!&88|Jt2l_V}I4$Z1`mZ%`T3`?K2MnAR)&u>Q8#pbf2l{IaoEFjp z{bdGD3+RFVNd`^}=Yjsy4V)It1N{yIr-kxB|NEaSe`$dX{|%fL#_-?3X+aGC4V)Ij z@ZZ2`0Sx~QoEEhi7N+puz-d7W{|%fLqVV6qX#ooV4V)I9@ZZ2` z!3qBjoEDn!-@s{s3I7e87MAeez-d7V{|%fLlJMWaX#olU4V*5R;lF{?LK6NPI4vO2 z|DP)V>B1iUuW*cx&DYTAsAYV706%LlP&lJ3tnKsM_X{G z1^?{I^#1+Of*-cvZ&~nHE%*x-{22?r+k!u0!GCMPe{R8lY{Bob;QwU7lNNll1>a!7 zyDWIS!nMS=iVF2+OrgXkOH}Jx{O%l4{#H@7mUN!Lk0GdD z!+AshJ4P7^snFlYe5A#X=<6U%{{UotE=v6eFxyfCx6Xu?%Y~N3TuW_Hy-;yY4hr8Y z(jfm%B_HQ4{VtLZsppZ@AClC5=;=>kPN(*z`g%gwlW4SR2jYvrqkqWZNBVL=qQ3=b zjQ^UYHA`w5YvP|S(UQ{>n4B84_%UtcyZJcpHED^Q$uxhFgSjA-^=k2hT6}*kq#pTu zD%Eo$68N>Y&xwvMx0a~dw2ZNxbe=0au5U+8 z+^bn;o;|tod4>8GG7H`n87h+H zPvZPXo~{rwo?1#WM%v5Jq>KX}!_09|594l^GR`I$ko5%WYARoi@7qa>p&Hlx-CAOF zA3C?^#3(KPtZUvMv_!!#fyaBb;up2bqpoj1jp69J@fY}3VOl~$lT{TkNPnD^Cab1V zc@=~|L~?FAft!U;;lzRaXf?EiSf20);=zXL&QBVp>qWm0A??>KNprqf$ zZ7O&Z$tHqy6=u<$vxHUWh#=jMsZXCl(M&k+LOD_X61oM^`wkE-yG2iURLXe@^|K&R zg{T=zGdaEnxDg)8JYW1h8Cxy+KoPQ?rG;tVM?6e3IB%bRCCM8j3~Jz^JEC`iMqxsH z5T-vPTE6%}U;H?w;!~su&_Mr<-N#d@M8SGUiSI}KwJ7MLoYc0U z&Jgr4WT@!IgrHnv{;m-8>0*?>20^OLGSTKuqK8GBeN;XZZIA=A$n?#rm{Y%>EPnyR z0zwy%hnIuI+4VkA$wXx_Rn7wG(BJ6l(HnoF|6WMpSEBLV{Xa&zxx_Vb6BLW}G`~Jg zR&9V%eH4WxQB`$0=_y#iL3xmBenh4v3uX$^EV8iASqwl2Y+8&78eQK2B1gG%HHa z6S@6|KQKEhudmAvd*jc4LyNzz^}KnszR{O-{tlg^B}dQYpz2PDC1K8wA2CD#BTE9aS74Bn71z{ey#X(Eq+wn_1;`<*NJS+@uc?r z=dm#m@PZVOn>rxIk>aOb>aOZSMzXG{OSGPsj(piUg zj^*Gr^b>#o8dNHKaPNUnl?Z$c*O{=dpM=B^7?fY7a^W#vejeqWnBb_7Jd;4L7i}H_ zoo-n4ABAF`MTx4>P)rG$+rJteWYI70cYKUY@g+td1n|at>x%bycb&-cRrak}9r4y7 zc;&}j`J?}W;aK+6;C#~TI?{VgbA7eM_^p${ z{MB~7ljTj0snja>#(t{pN@Z#Bf^M$y#GxO+E-n5!+-xROil5Zq!^sFHUBNALE(4He z^{?h3tLE32(*7%XtI?MLq}$&R9dA~f2bI&&h_rj-CxkcTpd0!J{W;l}H-s-@{mAXy zkDMVp7YI9_BQ)XM3n1-0bZ_&Z@2-i$-kK|X$*MCKVVNNp#vh;u@4!umCGi({-(!9J zcSPZ69gN)gZVRTbv}N)#KA-6UbT3l#kJJiR4RwhJh95;X)NMZO+O!Bu;hOFZ70k8i zlU!QQJlZuM5`T!c`nsdw!|A)`ed4bgpp^2UOpCtB;M;ko$f-H72t3~;o)39j=QlFx zdL5?)scWf8*9_ttEp@#>Fi~|AbUn(hIp3+@2LdUaK?=tUg|Fg13>1Fs1LG#4@G#0J zId|=#!k=R)qnz7Ce0Lua&NV@Pq5(^(@=p+GoYkN`mpmOW75@T4c3vbD{}MNNptzd^ zj1-DvC?}%10zi-XR0|c-z(PQPeO?|;elSP72Xz%2Ypy8 zXgGZ+0engI0NI-?9|m3_2OV=WxWTE#uPLhV#oLNB{rG#-%6Y`Y2Om}mxEO>a^nPps zW6kj`MN_Z{(cJt@T)za?K=;Gc zj4SBronjb}tSwrGT%rm$mXdlB?qC@M>-96hrSYnYQ= zn_e0Qua4`tqdk^HE$|CVqEFDV=)5YPaJbs_#b4KVaBjM?eey8m5HEfi!+2b2p_#m! zUqZ{!l~U-c^@pix()pWODEU*F2pwOQIx7*)=J*{&Q!t#!A>;BJ0dRw6L{F7v^c2dY z_h~JdDSC=Z(NkqaZxrue=LYC9bf^Gwn>Q6LL$GBG3n_+PbheD)vnZ+<9zRRO@MUr+ zafdM16vi8|(n1(7g+i=?S|GgVfD`>P&9!+eB8*_>Qzp|jJOwjIDFviZU-zyI=IxN6 zE=a!zZAkuhFNVYkuF)AlS&YT|r@(rMr9qy|w!uk->VQ&7Ru{2L=NsmMOdRtMxYf~*{dVDu7 zPjC&;2nMbA58AFzvb4&BuA44{%{VWO+K#WBpB;0ZURyNknCqHdDEW$VkGa0F%hl+- z0OR|Z>nbdPFLX{t`MP^|o=)nL<&R)p4P$T8VC?B|)0-HsVR4_Vih*Jo@O->}dI*i; zPiP6^z75>C`ytxBuEsS@41B4qKL)qygn~9u56koo(6Mu*(7l{=$NL~n|DL5`uHHz- z2t8tM&oSoqb_%nY+bNKI`lEX=_Z}3%ZiR_Gh^S*QhMRwn`!Na7Xv*D4@k}}o&LaYGk~Kww34a1ES$Yyr-S7q5%o7#zo0X zR93?I?X4~1_Eyq)j;!K$wkDAd@VIH>TLud?ZWYaG$|MKW{;^-0f+*@qQb#uM>}e}n zrauhtF{@o84bYHw%eWRbH^%ACSJL?})xs*8NA;hOOpF>jvl`b9uFaz{h!C}DxQ3vU zu)2b%@js45;3lY~?mwdIFq~$4tVj=o<58T_;Cyu{uC830R`NSzM^bxTj~?w`0z|AI z)eW4cv1psDa$+()Dmyj#Aj(ro(fL&{>FpI*}bH;5JQGR_b;q+0jNMTSPh z_!uOKJp4XNa*6SE;ud>7F+}vd>eoa@@W=6I=dPA-o}NO!AF;RitimgpY!Du;VP;C zET(>iooC@j3Pzo2dGTf)!bcRt?HA4F8V2`={$L?&JC zDxNP=?o~i2FV)@PQZ0L}o>;?GtR#_j{LTqaZOp-@+FGi89pR%p2n(q@c!C!APEx3m z4i)gPbA%?q`A92V;NA^Z{Q~+rm%g5P2Vb}mBrb>#pD}nrJb*IVk9wO_Roz4R!Y_#b zj>eB+v7XYOC*D`+>p$^@%-t^Xthg2#GYcefar{$Jd9|n z9h?ui$i!UoeN_Kk1VgqnUoUE^exgnN=Hsz7+CvmH920A<5Z4J_+(74d&g0LDE1oN` zyj_^D#b3~-UZ@rC_2R^ZD>m=0&$E2R$F<7G*UTyF?aR3lV-MGZy$(>iaEBx32VR_J zbLm>fG@+^aOxdYb?swgI2RMC|$LiuA`dsr5)j2-&RzBl%E$sL1dMk?tUlghu=gd;9g8sDbiD)2+VwWgCfc#P7%V? z4I+l#_-8eZGKMdT7+xl0_>5M(hllY!3S6-kf0@>b&*`tc0mY8Tz47JNhzWmrobD!F zm4~#ruDRyp{*sPmTx2;GsB4#XUFj)1?kRf{0=>zLreT403nbPhHZX5uNv^N>6AH>D zb(>RN^_t^QT|C6Rj#uj9UHK?lHPX6B^^Z6HA}y##BLY|^cpXn3x<}5FpVL}kGJ1yM z_6U?6`U&F2Fr4%wuV;)GzaCv(iPOSTh^<-Tjelr7u4w|f!B_klMN5BeKZ3FAv+UT! zvcJl__QeNqHxv(RYEolxt^YWB{nf66mgF*B15^Jzj&SjQqTg8{4GAAQh9WQ&fuRTt zMPMibLlGE?z)%E+A}|zzp$H5`U?>9rKO^8^S!^kPzl*yu6pfYOpZ9owiyJRjX$|t1 zu5f&cz>ArO-hwj6eZl1D1@1^F)`j=8G`S}?O&Y=IwJh%DaKs%9w|BOMVy)p0))ZdT zK`(NFxM&Rj%_pK8?+j@UMc6cV6#vw(V(yjeVxi~=hBp}Cg%c$!@gMz22n|9V!4Su; z3q@L+*Fjsft1ZU(YgpWYn8e-Zu{rKQFxC}ltAfbT+Rjig7HXXAq?W2>} zf*P;SQ#*-u%vn9BWiDPD5bBUl!r+b$aFN0CwK7yY<$UF{FqoS=2NJoxOuV7S-5Ck5 ziUitOppD)i6ALMscmY8;vW`K7KhV?^@kb-Us?xQCcvCb+xWR@fSHvOp865hlBPf`Fr|2!RP2X)B|KV)&NK>zpHhUR;VFeoB?^*A1^0Y+ zdmt8RUF#+tGVs$RbfW{@Br6n+RuMnXgE?+`xdbvc7IyOt7)&~zy7UqgD4^#VncJsj zrj>=%uT3UlG?$)S#^7^h~_r+E2SW9bk`o-ks zlwwOYm&W6G^2?BnV47Z#B*J9GUA$b$Xn3h-@gk4UU$c0jzt*$VU+1aw*VNW7#vl}l zm)0F*Pq=xC6pmi;$BLzRnfKHSDl28qQcT5&vF!tLDRW#^VxJsRl}8qT25l1=gYjkK zOC(6X8JQ%4N%lv>U6CLLEOPJ^Y|m`BX&hQiKSjHFrf2E{RHWIB!R)8OZ0Z%n+0-a) z6p3+T%EhKoa{$w)KhTM(t%>K>dfYw6&vR2L`44AKWXZ$7-S*LwJ1%yN&;I%YAmNz< z%l%_2bpUY6GpSTAo~Kj*x&gJlsniE}P;}<=snn-<5Lf}Y3Gn89snnPp#@4=+O6>t( z%gd=$4DBBT+yb}{un>HA{xy}F1*pA}N_7I#bCwF!PXVOoHX7gxz!t!5fZOp1V-)1? z!(+cv$k*^xa4jGmWG5Yt^^2KfZN6jtsNuO=!9#TX)`V?`O&3yw+Tr;Gitw3*kNYY3 z0+}o?e?ea1C9aWca=Y2wugv-C%%XEa<^0zG7CxOyaYiEIAh|Jo@`+CT@}M?b@Y#nO z`1q{`KZv#spTB^HpBeCjXurYdfQd#jDPHfw)_Oozo0p%pFfW(d9>Avv@l^bz`vZJ> z!CwYH*^<2cTeIfn6@DjsUY;3d3m|s$De%qnP;EV@~V4=-!yECGnw<9>|3+4){n^4xOOCew?Z5AJqR0r54B8# zjmt;mmy%B!`E+^Sw$rjV56@`nCG?wvY534#(pfyH6WnH((T?Ij3m+fy`#%74 zE`SXuWO_wQyusAX8X98T$aCg(?~k55S%UNT!^h z0YOZ@L;akje%_K(n>Xbqr;d~?h zVC@-plXFW>GW$DOJPxI3>!NWEm_vgQ2Y$c7=yx`4e@Ug@AOiLMi+QYo#p*w*~7zJ|2&Qz8SYGuV?WAu{&*bwd2Yq?W7*B8y_WsRIQF~I&W+<7yYif` zj&=Om^?$QK>N$g+;s5N+3s8RhOefkNKhp_GJ!38f<()B3$opW72jzzfV8`PHP844& zz*D$X!2}e4GnQJ2xTNkn-ErDtwA${#m!zc}#fyK0PP;94H2$4(AopR=|82zRU*xfu zNBjUz+>+-+`Tjg7YEvmk&PY748ajp|Fcg8I2ndA91co9o6oH`#3`Jll z0{_D!px(=?-ovZjyZb-ft^alAi@O)tW;P&mEQnT}BcFydqi$}XoEwV!H&v$ zNoE>u#n3_b!swuTS#((Mx=1!wdcCL@cK~s(+H#*+ieuwMGq#Ox9^z2+Ui>|P4rTWN z8K~0)+0c)FXYoU|!yhZSe`wB0_l~8T|l7FtS!!2Q{gw+z(OSnS9P6@ju+$!NV33o`? zE8$)V4@h`aLUx|?UqUyeG$AXH?eXR~D)={bz{+p!+>UVSw$nCr4J zE2#M(oj#nYc_W>k%hY&Er=P~uI83LHU~0Ul(?=TfLOLC@q}xmpac?6)v?9fo9_g!Jdrt0!HtIvL4xN zEK~Enxh@;Cr<&)}>4nDrN9pvlSf=?W8)pqQU#GXn*`?b|5#?;tK1$}=Y&OA+^0EWp zIqdzEwSvwoSvYHT(}NNmDtvT4Bf5$|Jx8MBVtj}`^#YC>-LFZyS{J<`s#y@w0XwJl z2m;wbM+Bct_G|#1i@iZrI8Bc?>7mELw%Ql_dgxmCHXle`azyV^7LM z`V%(GkhfQGIVa~^JLniiCmr9mp>G5IT=YvdB?b<)E_w*`O#6SmYa?fBR%X3(LC1YC->{(&4hxRYa85!`E~nVj zoPIXTmwB$%WiQysIR^TOwlU`lMQ_24i(hK|5J|u3Tp5W4Sf;f zO!FtR*gqHZa6aUh%JHK7x|;Wk!Ukd3hMu&cKLa}DdGA~yV-oby@sUmYnK<83KI|wH z?d}zT)qzfSTH{YI=gbthEuho5Q~UgMOF2*2wD0G1%zM?MVMw6&b3<9a*{1zAPRDt+ zMl?iv^J53-WKXv||CY&qf7wP(7EVB!?3`pnF9F>xb(ukx4gCtxsb98j6s^@b>adZ| zXm_=eU}q<`q4SlMMq99RW*7aon6^x;P`_SWd}-N?!JlsWW8eH{Nif{jB$=t?U+BAR zUX2eM!ga0md((CRrJe446@F~GNt^GnU?die;kV=^K{mJ}sXx|En}T+PqS*Tl`+%-$ z3$F~c`J3>Y+^9d$wU)_Wr-zzKDoST#le$ceu>W93D}Id}39Q5Jb}{^pyBWXn4*8q9 z+S}KG#f0%q5Yw9a{RCZs{I|)*~JEagLczadHxXn4)=>wCEbS>_|Rhb zVBM56U1R#Do#eKq-g_DL2yDX6palO?v@~%?Z_>gX#Y8y$1Q# zjQ`Mvt`_r<__q7erZ>gUj!*u%xct38{k|SCoH27%s3R1?pB1p7uKc3~ceh<`rfBhm z@4A>y!>Ead{VmwJQEbec_Dg@L_?rBq0ge4r-!%3lPfYv$VuMTk|DpDZ5|~_qorW`Q zmYR7B+%%~rl{|@z{Y29$F^%v)z5EyUjZNq0X{N@Q%1_$#6TKpabNa8sX`4#yVJU@R z1Y`e5%XX{gy^PJ9VGi=}(L>t>cHC?X% zD(ul}C7q&Uv|DN>)>GO1*JgWDi#W>;rLvZ9|A}6uKl;!=N-#BJQ*1T)|95W6n`Vy4 zWAmohC+*9d-X43Jsx5-WuGYqm#%b+%Bfgugm5jk<+T-`++AlkUeA$M%7KuFjib5gu?lBS-m}=!NNlMajCS$Wic)qeA+QKK)k*8* z$eI2y!;Lm?eXjN^Pg92RBDVyVWR*tz1aItMY&pH3e3nUXKn|$sUM;DNJ)Xa0MTNx6 zv_pKF)xtO08{R_pHc~1^u-ADs7K&iPp%rgqU?tAR>ANn2PwsatcS5Faqt&hGYm4z; zl;WB8=C)X6-D_D^%TUWnl5aDQS#ni-xI<((t&+*1%xQDWF;NpSg2%GM^_2{$cf$8W9v+$m0hiEP19SOBnewUPsg9- z7;oMZh_DkpWz)xT+8`RGH+E84$@$ z4z#yImynM6hT&4Io&U|$e0}(jmgzYNUBYHb0qVJ*g6g@xk}ofJ70MrT;?|78ubvMo z=r&r40&b6_9kYR%`Iq57Djfw_ zm5_W{Q1E`>W`6bjS3%Xzj(>-RUp*&QF#k*RKV;!o&odQtS^{9T=P?VvdhV>C`dyH+ z-x|N&l3(?|@qo2na1 +#include +#include +#include "common.h" + +void kernel_body(int task_id, kernel_arg_t *__UNIFORM__ arg) { + const float *A = (const float *)arg->addr_src; + float *C = (float *)arg->addr_dst; + + int incr = A[task_id]; + float sum = 0.0f; + float sum1 = 0.0f; + float sum2 = 0.0f; + float sum3 = 0.0f; + float sum4 = 0.0f; + float sum5 = 0.0f; +#pragma unroll 8 + for (int i = 0; i < 5000; i++) { + sum1 = sum2 + 5.0f; + sum2 = sum3 + 5.0f; + sum3 = sum4 + 5.0f; + sum4 = sum5 + 5.0f; + sum5 = sum1 + 5.0f; + } + + sum = sum1 + sum2 + sum3 + sum4 + sum5; + C[task_id] = static_cast(sum); +} + +int main() { + kernel_arg_t *arg = (kernel_arg_t *)KERNEL_ARG_DEV_MEM_ADDR; + const uint32_t grid_size = arg->size; +#ifdef RADIANCE + vx_spawn_tasks_cluster(grid_size, (vx_spawn_tasks_cb)kernel_body, arg); +#else + // NOTE: This kernel assumes contiguous thread scheduling for efficient shared + // memory allocation, and therefore does not work with original vx_spawn_tasks + vx_spawn_tasks_contiguous(grid_size, (vx_spawn_tasks_cb)kernel_body, arg); +#endif + return 0; +} diff --git a/tests/regression/flops/main.cpp b/tests/regression/flops/main.cpp new file mode 100644 index 00000000..72aa56ba --- /dev/null +++ b/tests/regression/flops/main.cpp @@ -0,0 +1,252 @@ +#include +#include +#include +#include +#include +#include +#include "common.h" + +#define RT_CHECK(_expr) \ + do { \ + int _ret = _expr; \ + if (0 == _ret) \ + break; \ + printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \ + cleanup(); \ + exit(-1); \ + } while (false) + +/////////////////////////////////////////////////////////////////////////////// + +const char* kernel_file = "kernel.bin"; +uint32_t count = 0; + +std::vector src_data; +std::vector ref_data; + +vx_device_h device = nullptr; +std::vector staging_buf; +kernel_arg_t kernel_arg = {}; + +static void show_usage() { + std::cout << "Vortex Test." << std::endl; + std::cout << "Usage: [-k: kernel] [-n words] [-h: help]" << std::endl; +} + +static void parse_args(int argc, char **argv) { + int c; + while ((c = getopt(argc, argv, "n:k:h?")) != -1) { + switch (c) { + case 'n': + count = atoi(optarg); + break; + case 'k': + kernel_file = optarg; + break; + case 'h': + case '?': { + show_usage(); + exit(0); + } break; + default: + show_usage(); + exit(-1); + } + } +} + +void cleanup() { + if (device) { + // vx_mem_free(device, kernel_arg.addr_a); + // vx_mem_free(device, kernel_arg.addr_b); + // vx_mem_free(device, kernel_arg.addr_c); + vx_dev_close(device); + } +} + +void generate_source_data(size_t size) { + src_data.resize(size); + + for (uint32_t i = 0; i < src_data.size(); ++i) { + src_data[i] = static_cast(i); + } +} + +void generate_reference_data(size_t size) { + ref_data.resize(size); + + for (uint32_t i = 0; i < ref_data.size(); ++i) { + ref_data[i] = static_cast(i) * 1000.0f; + } +} + +int run_test(const kernel_arg_t& kernel_arg, + uint32_t buf_size, + uint32_t size) { + // start device + std::cout << "start device" << std::endl; + RT_CHECK(vx_start(device)); + + // wait for completion + std::cout << "wait for completion" << std::endl; + RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT)); + + // download destination buffer + std::cout << "download destination buffer" << std::endl; + RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.addr_dst, buf_size)); + + std::cout << "downloading result C matrix from device, device mem address=" + << std::hex << kernel_arg.addr_dst << ", size=" << std::dec + << buf_size << " bytes\n"; + std::ofstream file("output.bin", std::ios::binary | std::ios::out); + if (!file) { + std::cerr << "error: failed to open output.bin for writing\n"; + exit(EXIT_FAILURE); + } + file.write(reinterpret_cast(staging_buf.data()), buf_size); + file.close(); + + std::ofstream ref_file("reference.bin", std::ios::binary | std::ios::out); + if (!ref_file) { + std::cerr << "error: failed to open reference.bin for writing\n"; + exit(EXIT_FAILURE); + } + ref_file.write(reinterpret_cast(ref_data.data()), buf_size); + ref_file.close(); + + // verify result + std::cout << "verify result" << std::endl; + { + int errors = 0; + auto buf_ptr = (float*)staging_buf.data(); + for (uint32_t i = 0; i < size; ++i) { + float ref = ref_data.at(i); + float cur = buf_ptr[i]; + if (std::abs((cur - ref) / ref) > 1e-6) { + std::cout << "error at result #" << std::dec << i + << std::hex << ": actual=" << cur << ", expected=" << ref << std::endl; + ++errors; + } + } + if (errors != 0) { + std::cout << "Found " << std::dec << errors << " errors!" << std::endl; + std::cout << "FAILED!" << std::endl; + return 1; + } + } + + return 0; +} + +int main(int argc, char *argv[]) { + // parse command arguments + parse_args(argc, argv); + + if (count == 0) { + count = 1; + } + + std::srand(50); + + // open device connection + std::cout << "open device connection" << std::endl; + RT_CHECK(vx_dev_open(&device)); + + size_t size = 64; + + generate_source_data(size); + generate_reference_data(size); + + uint32_t src_buf_size = src_data.size() * sizeof(src_data[0]); + uint32_t dst_buf_size = ref_data.size() * sizeof(ref_data[0]); + + std::cout << "buffer size: " << dst_buf_size << " bytes" << std::endl; + + // upload program + std::cout << "upload program" << std::endl; + RT_CHECK(vx_upload_kernel_file(device, kernel_file)); + + // allocate device memory + std::cout << "allocate device memory" << std::endl; + // RT_CHECK(vx_mem_alloc(device, src_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_src)); + // RT_CHECK(vx_mem_alloc(device, dst_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_dst)); + kernel_arg.addr_src = 0x20000UL; + kernel_arg.addr_dst = 0xc0000000UL; + kernel_arg.size = size; + + std::cout << "dev_addr_src=0x" << std::hex << kernel_arg.addr_src << std::endl; + std::cout << "dev_addr_dst=0x" << std::hex << kernel_arg.addr_dst << std::endl; + + // allocate staging buffer + { + std::cout << "allocate staging buffer" << std::endl; + uint32_t staging_buf_size = std::max( + src_buf_size, + std::max( + src_buf_size, + std::max(dst_buf_size, sizeof(kernel_arg_t)))); + staging_buf.resize(staging_buf_size); + } + + // upload kernel argument + { + std::cout << "upload kernel argument" << std::endl; + auto buf_ptr = staging_buf.data(); + memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t)); + RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t))); + + std::cout << "uploading argument buffer to device, device mem address=" + << std::hex << KERNEL_ARG_DEV_MEM_ADDR << ", size=" << std::dec + << sizeof(kernel_arg_t) << " bytes\n"; + std::ofstream file("args.bin", std::ios::binary | std::ios::out); + if (!file) { + std::cerr << "error: failed to open args.bin for writing\n"; + exit(EXIT_FAILURE); + } + file.write(reinterpret_cast(staging_buf.data()), + sizeof(kernel_arg_t)); + file.close(); + } + + // upload source buffer + { + { + auto buf_ptr = staging_buf.data(); + memcpy(buf_ptr, src_data.data(), src_data.size() * sizeof(float)); + RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_src, staging_buf.data(), + src_buf_size)); + + std::cout << "uploading source data to device, device mem address=" + << std::hex << kernel_arg.addr_src << ", size=" << std::dec + << src_buf_size << " bytes\n"; + std::ofstream file("input.a.bin", std::ios::binary | std::ios::out); + if (!file) { + std::cerr << "error: failed to open input.a.bin for writing\n"; + exit(EXIT_FAILURE); + } + file.write(reinterpret_cast(buf_ptr), src_buf_size); + file.close(); + } + } + + // clear destination buffer + { + std::cout << "clear destination buffer" << std::endl; + auto buf_ptr = (int32_t*)staging_buf.data(); + for (uint32_t i = 0; i < ref_data.size(); ++i) { + buf_ptr[i] = 0xdeadbeef; + } + RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_dst, staging_buf.data(), dst_buf_size)); + } + + // run tests + std::cout << "run tests" << std::endl; + RT_CHECK(run_test(kernel_arg, dst_buf_size, kernel_arg.size)); + std::cout << "PASSED!" << std::endl; + + // cleanup + std::cout << "cleanup" << std::endl; + cleanup(); + + return 0; +} From 793779aa6cd0fe0e316ff455e3a5dbee4635a7a7 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Wed, 24 Apr 2024 21:08:31 -0700 Subject: [PATCH 3/4] sgemm_wg: 128x128 config --- tests/regression/sgemm_wg/kernel.cpp | 14 +++++++------- tests/regression/sgemm_wg/main.cpp | 6 +++--- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/tests/regression/sgemm_wg/kernel.cpp b/tests/regression/sgemm_wg/kernel.cpp index e9f898a0..86b7309d 100644 --- a/tests/regression/sgemm_wg/kernel.cpp +++ b/tests/regression/sgemm_wg/kernel.cpp @@ -16,11 +16,11 @@ // (BM*BN) / (TM*TN) == threadblock size >= NT * CORES_PER_CLUSTER // * Combining BM * BK >= (BM*BN) / (TM*TN) == threadblock yields // BM <= BK*TM*TN -#define BM 8 +#define BM 32 #define BN BM -#define BK 2 -#define TM 2 -#define TN 2 +#define BK 8 +#define TM 4 +#define TN 4 void threadblock_barrier(unsigned int tid_in_threadblock, unsigned int barrier_id, unsigned int count) { vx_fence(); @@ -80,14 +80,14 @@ void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, // // Make sure global offset values for A and B are contiguous between // neighboring threads to ensure GMEM coalescing. -// #pragma GCC unroll 1 +#pragma GCC unroll 2 for (uint32_t load_offset = 0; load_offset < BM; load_offset += stride_a) { const uint32_t global_a_offset = dim_k * (global_a_row + load_offset) + (k + local_a_col); local_a[BK * (local_a_row + load_offset) + local_a_col] = A[global_a_offset]; } -// #pragma GCC unroll 1 +#pragma GCC unroll 2 for (uint32_t load_offset = 0; load_offset < BK; load_offset += stride_b) { const uint32_t global_b_offset = dim_n * (k + local_b_row + load_offset) + global_b_col; @@ -99,7 +99,7 @@ void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, threadblock_dim_y); // Compute single tile*tile matmul -// #pragma GCC unroll 2 +#pragma GCC unroll 4 for (uint32_t local_k = 0; local_k < BK; local_k++) { // First, pump data from SMEM->RF #pragma GCC unroll TM diff --git a/tests/regression/sgemm_wg/main.cpp b/tests/regression/sgemm_wg/main.cpp index 709d804c..62625c44 100644 --- a/tests/regression/sgemm_wg/main.cpp +++ b/tests/regression/sgemm_wg/main.cpp @@ -166,9 +166,9 @@ int main(int argc, char *argv[]) { RT_CHECK(vx_dev_open(&device)); // FIXME: hardcoded - uint32_t dim_m = 32; - uint32_t dim_n = 32; - uint32_t dim_k = 32; + uint32_t dim_m = 128; + uint32_t dim_n = 128; + uint32_t dim_k = 128; generate_source_matrix(dim_m, dim_n, dim_k); generate_reference_matmul(dim_m, dim_n, dim_k); From df881fd69f08028b1e2fdd37267145a0033c35d0 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Wed, 24 Apr 2024 21:09:01 -0700 Subject: [PATCH 4/4] Generate separate ELF for radiance --- tests/regression/common.mk | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/tests/regression/common.mk b/tests/regression/common.mk index 81df3139..e90b3635 100644 --- a/tests/regression/common.mk +++ b/tests/regression/common.mk @@ -79,7 +79,7 @@ endif endif endif -all: $(PROJECT) kernel.bin kernel.dump kernel.radiance.dump +all: $(PROJECT) kernel.bin kernel.dump kernel.radiance.dump kernel.radiance.$(CONFIG).dump kernel.dump: kernel.elf $(VX_DP) -D kernel.elf > kernel.dump @@ -87,15 +87,21 @@ kernel.dump: kernel.elf kernel.radiance.dump: kernel.radiance.elf $(VX_DP) -D kernel.radiance.elf > kernel.radiance.dump +kernel.radiance.$(CONFIG).dump: kernel.radiance.$(CONFIG).elf + $(VX_DP) -D kernel.radiance.$(CONFIG).elf > kernel.radiance.$(CONFIG).dump + kernel.bin: kernel.elf kernel.radiance.elf $(VX_CP) -O binary kernel.elf kernel.bin kernel.elf: $(VX_SRCS) $(VX_CXX) $(VX_CFLAGS) $(VX_SRCS) $(VX_LDFLAGS) -o kernel.elf -kernel.radiance.elf: $(VX_SRCS) +kernel.radiance.elf: kernel.elf $(VX_CXX) $(VX_CFLAGS) $(VX_SRCS) $(VX_LDFLAGS) -DRADIANCE -o kernel.radiance.elf +kernel.radiance.$(CONFIG).elf: kernel.radiance.elf + cp $< $@ + $(PROJECT): $(SRCS) $(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@ @@ -122,7 +128,7 @@ clean: rm -rf $(PROJECT) *.o .depend clean-all: clean - rm -rf kernel.elf kernel.radiance.elf *.dump + rm -rf kernel.elf kernel.dump ifneq ($(MAKECMDGOALS),clean) -include .depend