fixes: texture unit memory accesses sometimes going to smem; bilinear texture filtering. new: cache req_id

2021-11-24 00:00:17 -05:00
parent 1501360f4b
commit 18762dffce
70 changed files with 3818 additions and 1727 deletions
--- a/tests/regression/tex/Makefile
+++ b/tests/regression/tex/Makefile
@@ -9,8 +9,8 @@ VX_CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++
 VX_DP  = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump
 VX_CP  = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy

-VX_CFLAGS += -march=rv32imf -mabi=ilp32f -O3 -Wstack-usage=1024 -ffreestanding -nostartfiles -fdata-sections -ffunction-sections
-VX_CFLAGS += -I$(VORTEX_RT_PATH)/include -I$(VORTEX_RT_PATH)/../hw
+VX_CFLAGS += -std=c++11 -march=rv32imf -mabi=ilp32f -O3 -Wstack-usage=1024 -ffreestanding -nostartfiles -fdata-sections -ffunction-sections
+VX_CFLAGS += -DENABLE_SW -I$(VORTEX_RT_PATH)/include -I$(VORTEX_RT_PATH)/../hw -I$(VORTEX_RT_PATH)/../sim/common

 VX_LDFLAGS += -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link.ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a

@@ -21,7 +21,7 @@ CXXFLAGS += -std=c++11 -O0 -g -Wall -Wextra -Wfatal-errors

 CXXFLAGS += -DLUPNG_USE_ZLIB

-CXXFLAGS += -I$(VORTEX_DRV_PATH)/include
+CXXFLAGS += -I$(VORTEX_DRV_PATH)/include -I$(VORTEX_RT_PATH)/../hw -I$(VORTEX_RT_PATH)/../sim/common

 LDFLAGS += -L$(VORTEX_DRV_PATH)/stub -lvortex  -lz

@@ -38,7 +38,7 @@ kernel.bin: kernel.elf
 	$(VX_CP) -O binary kernel.elf kernel.bin

 kernel.elf: $(VX_SRCS)
-	$(VX_CC) $(VX_CFLAGS) $(VX_SRCS) $(VX_LDFLAGS) -o kernel.elf
+	$(VX_CXX) $(VX_CFLAGS) $(VX_SRCS) $(VX_LDFLAGS) -o kernel.elf

 $(PROJECT): $(SRCS)
 	$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
--- a/tests/regression/tex/common.h
+++ b/tests/regression/tex/common.h
@@ -1,25 +1,27 @@
 #ifndef _COMMON_H_
 #define _COMMON_H_

+#include <VX_config.h>
+
 #define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000

 typedef struct {
-  uint32_t num_tasks;
-  uint8_t format;
-  uint8_t filter;
-  uint8_t wrap;
-  uint8_t use_sw;
-  uint32_t lod;
-  uint8_t src_logWidth;
-  uint8_t src_logHeight;
-  uint8_t src_stride;
-  uint8_t src_pitch;
-  uint32_t src_ptr;
-  uint32_t dst_width;
-  uint32_t dst_height;
-  uint8_t dst_stride;  
-  uint32_t dst_pitch;
-  uint32_t dst_ptr;  
+  bool      use_sw;
+  uint32_t  num_tasks;
+  uint8_t   format;
+  uint8_t   filter;
+  uint8_t   wrapu;
+  uint8_t   wrapv;  
+  uint8_t   src_logwidth;
+  uint8_t   src_logheight;
+  uint32_t  src_addr;
+  float     lod;
+  uint32_t  mip_offs[TEX_LOD_MAX+1];  
+  uint32_t  dst_width;
+  uint32_t  dst_height;
+  uint8_t   dst_stride;  
+  uint32_t  dst_pitch;
+  uint32_t  dst_addr;  
 } kernel_arg_t;

 #endif
--- a/tests/regression/tex/kernel.c
+++ b/tests/regression/tex/kernel.c
@@ -1,11 +1,9 @@
 #include <stdint.h>
 #include <vx_intrinsics.h>
 #include <vx_spawn.h>
-#include "common.h"
+#include <vx_print.h>
 #include "texsw.h"

-#define ENABLE_SW
-
 typedef struct {
  	kernel_arg_t* state;	
  	uint32_t tile_width;
@@ -14,29 +12,50 @@ typedef struct {
  	float deltaY;
 } tile_arg_t;

+template <typename T, T Start, T End> // compile-time unrolled loop: calls the callback with Start, Start+1, ..., End-1
+struct static_for_t {
+    template <typename Fn>
+    inline void operator()(const Fn& callback) const {
+        callback(Start); // run the body for the current index
+        static_for_t<T, Start+1, End>()(callback); // then recurse on the next index
+    }
+};

+template <typename T, T N> // terminating specialization, selected once Start has reached End
+struct static_for_t<T, N, N> {
+    template <typename Fn>
+    inline void operator()(const Fn& callback) const {} // nothing left to invoke; callback is intentionally unused
+};
+
 void kernel_body(int task_id, tile_arg_t* arg) {
 	kernel_arg_t* state = arg->state;
 	
 	uint32_t xoffset = 0;
-	uint32_t yoffset = task_id * arg->tile_height;	
-	uint8_t* dst_ptr = (uint8_t*)(state->dst_ptr + xoffset * state->dst_stride + yoffset * state->dst_pitch);
+	uint32_t yoffset = task_id * arg->tile_height;

-	float fv = yoffset * arg->deltaY;
+	uint8_t* dst_ptr = (uint8_t*)(state->dst_addr + xoffset * state->dst_stride + yoffset * state->dst_pitch);
+
+	Fixed<16> xlod(state->lod);
+
+	/*vx_printf("task_id=%d, deltaX=%f, deltaY=%f, tile_width=%d, tile_height=%d\n", 
+		task_id, arg->deltaX, arg->deltaY, arg->tile_width, arg->tile_height);*/
+
+	float fv = (yoffset + 0.5f) * arg->deltaY;
 	for (uint32_t y = 0; y < arg->tile_height; ++y) {
 		uint32_t* dst_row = (uint32_t*)dst_ptr;
-		float fu = xoffset * arg->deltaX;
+		float fu = (xoffset + 0.5f) * arg->deltaX;
 		for (uint32_t x = 0; x < arg->tile_width; ++x) {
-			int32_t u = (int32_t)(fu * (1<<20));
-			int32_t v = (int32_t)(fv * (1<<20));
+			Fixed<TEX_FXD_FRAC> xu(fu);
+			Fixed<TEX_FXD_FRAC> xv(fv);
+			uint32_t color;
 		#ifdef ENABLE_SW
-			if (state->use_sw) {
-				dst_row[x] = (state->filter == 2) ? tex3_sw(state, 0, u, v, state->lod) : tex_sw(state, 0, u, v, state->lod);
-			} else {
-		#endif
-			dst_row[x] = (state->filter == 2) ? vx_tex3(0, u, v, state->lod) : vx_tex(0, u, v, state->lod);
-		#ifdef ENABLE_SW
-			}
+			if (state->use_sw)
+				color = tex_load_sw(state, xu, xv, xlod);
+			else
 		#endif
+			color = tex_load_hw(state, xu, xv, xlod);						
+			//vx_printf("task_id=%d, x=%d, y=%d, fu=%f, fv=%f, xu=0x%x, xv=0x%x, color=0x%x\n", task_id, x, y, fu, fv, xu.data(), xv.data(), color);			
+			dst_row[x] = color;
 			fu += arg->deltaX;
 		}
 		dst_ptr += state->dst_pitch;
@@ -48,13 +67,16 @@ int main() {
 	kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;

 	// configure texture unit
-	vx_csr_write(CSR_TEX_ADDR(0),   arg->src_ptr);
-	vx_csr_write(CSR_TEX_MIPOFF(0), 0);	
-	vx_csr_write(CSR_TEX_WIDTH(0),  arg->src_logWidth);
-	vx_csr_write(CSR_TEX_HEIGHT(0), arg->src_logHeight);
-	vx_csr_write(CSR_TEX_FORMAT(0), arg->format);
-	vx_csr_write(CSR_TEX_WRAP(0),   (arg->wrap << 2) | arg->wrap);
-	vx_csr_write(CSR_TEX_FILTER(0), (arg->filter ? 1 : 0));
+	csr_write(CSR_TEX(0, TEX_STATE_WIDTH),  arg->src_logwidth);	
+	csr_write(CSR_TEX(0, TEX_STATE_HEIGHT), arg->src_logheight);
+	csr_write(CSR_TEX(0, TEX_STATE_FORMAT), arg->format);
+	csr_write(CSR_TEX(0, TEX_STATE_WRAPU),  arg->wrapu);
+	csr_write(CSR_TEX(0, TEX_STATE_WRAPV),  arg->wrapv);
+	csr_write(CSR_TEX(0, TEX_STATE_FILTER), (arg->filter ? 1 : 0));
+	csr_write(CSR_TEX(0, TEX_STATE_ADDR),   arg->src_addr);
+	static_for_t<int, 0, TEX_LOD_MAX+1>()([&](int i) {
+		csr_write(CSR_TEX(0, TEX_STATE_MIPOFF(i)), arg->mip_offs[i]);
+	});	

 	tile_arg_t targ;
 	targ.state       = arg;
@@ -64,4 +86,9 @@ int main() {
 	targ.deltaY      = 1.0f / arg->dst_height;
 	
 	vx_spawn_tasks(arg->num_tasks, (vx_spawn_tasks_cb)kernel_body, &targ);
+	/*for (uint32_t t=0; t < arg->num_tasks; ++t) {		
+		kernel_body(t, &targ);
+	}*/
+
+	return 0;
 }
--- a/tests/regression/tex/main.cpp
+++ b/tests/regression/tex/main.cpp
@@ -25,10 +25,11 @@ const char* kernel_file = "kernel.bin";
 const char* input_file  = "palette64.png";
 const char* output_file = "output.png";
 int wrap    = 0;
-int filter  = 0;
+int filter  = 0;    // 0-> point, 1->bilinear, 2->trilinear
 float scale = 1.0f;
 int format  = 0;
 bool use_sw = false;
+float lod   = 1.0f;  // >= 1.0f 
 ePixelFormat eformat = FORMAT_A8R8G8B8;

 vx_device_h device = nullptr;
@@ -36,7 +37,7 @@ vx_buffer_h buffer = nullptr;

 static void show_usage() {
   std::cout << "Vortex Texture Test." << std::endl;
-   std::cout << "Usage: [-k: kernel] [-i image] [-o image] [-s scale] [-w wrap] [-f format] [-g filter] [-z no_hw] [-h: help]" << std::endl;
+   std::cout << "Usage: [-k: kernel] [-i image] [-o image] [-s scale] [-w wrap] [-f format] [-g filter] [-l lod] [-z no_hw] [-h: help]" << std::endl;
 }

 static void parse_args(int argc, char **argv) {
@@ -55,6 +56,9 @@ static void parse_args(int argc, char **argv) {
    case 'w':
      wrap = std::atoi(optarg);
      break;
+    case 'l':
+      lod = std::stof(optarg, NULL);
+      break;
    case 'z':
      use_sw = true;
      break;
@@ -118,7 +122,7 @@ int run_test(const kernel_arg_t& kernel_arg,

  // download destination buffer
  std::cout << "download destination buffer" << std::endl;
-  RT_CHECK(vx_copy_from_dev(buffer, kernel_arg.dst_ptr, buf_size, 0));
+  RT_CHECK(vx_copy_from_dev(buffer, kernel_arg.dst_addr, buf_size, 0));

  std::vector<uint8_t> dst_pixels(buf_size);
  auto buf_ptr = (uint8_t*)vx_host_ptr(buffer);
@@ -137,25 +141,39 @@ int run_test(const kernel_arg_t& kernel_arg,
 int main(int argc, char *argv[]) {
  kernel_arg_t kernel_arg;  
  std::vector<uint8_t> src_pixels;
+  std::vector<uint32_t> mip_offsets;
  uint32_t src_width;
  uint32_t src_height;
  
  // parse command arguments
  parse_args(argc, argv);

-  RT_CHECK(LoadImage(input_file, eformat, src_pixels, &src_width, &src_height));
+  {
+    std::vector<uint8_t> staging;  
+    RT_CHECK(LoadImage(input_file, eformat, staging, &src_width, &src_height));  
+    
+    RT_CHECK(GenerateMipmaps(src_pixels, mip_offsets, staging, eformat, src_width, src_height));
+
+    //uint32_t src_bpp = Format::GetInfo(eformat).BytePerPixel;  
+    //dump_image(src_pixels, src_pixels.size() / src_bpp, 1, src_bpp);
+  }

  // check power of two support
-  if (!ISPOW2(src_width) || !ISPOW2(src_height)) {
+  if (!ispow2(src_width) || !ispow2(src_height)) {
    std::cout << "Error: only power of two textures supported: width=" << src_width << ", heigth=" << src_height << std::endl;
    return -1;
  }

-  uint32_t src_bpp = Format::GetInfo(eformat).BytePerPixel;
-  
-  //dump_image(src_pixels, src_width, src_height, src_bpp);
+  uint32_t src_logwidth  = log2ceil(src_width);
+  uint32_t src_logheight = log2ceil(src_height);

-  uint32_t src_bufsize = src_bpp * src_width * src_height;
+  uint32_t src_max_lod = std::max(src_logwidth, src_logheight);
+  if (lod > src_max_lod) {
+    std::cout << "Error: out-of-bound level-of-detail: lod=" << lod << ", source image=" << src_max_lod << std::endl;
+    return -1;
+  }
+
+  uint32_t src_bufsize = src_pixels.size();

  uint32_t dst_width   = (uint32_t)(src_width * scale);
  uint32_t dst_height  = (uint32_t)(src_height * scale);
@@ -183,7 +201,7 @@ int main(int argc, char *argv[]) {

  // allocate device memory
  std::cout << "allocate device memory" << std::endl;  
-  size_t src_addr, dst_addr;
+  uint64_t src_addr, dst_addr;
  RT_CHECK(vx_alloc_dev_mem(device, src_bufsize, &src_addr));
  RT_CHECK(vx_alloc_dev_mem(device, dst_bufsize, &dst_addr));

@@ -192,32 +210,37 @@ int main(int argc, char *argv[]) {

  // allocate staging shared memory  
  std::cout << "allocate shared memory" << std::endl;    
-  uint32_t alloc_size = std::max<uint32_t>(sizeof(kernel_arg_t), std::max<uint32_t>(src_bufsize, dst_bufsize));
+  uint32_t alloc_size = std::max<uint32_t>(sizeof(kernel_arg_t), 
+                            std::max<uint32_t>(src_bufsize, dst_bufsize));
  RT_CHECK(vx_alloc_shared_mem(device, alloc_size, &buffer));
  
  // upload kernel argument
  std::cout << "upload kernel argument" << std::endl;
  {
+    kernel_arg.use_sw     = use_sw;
    kernel_arg.num_tasks  = std::min<uint32_t>(num_tasks, dst_height);
    kernel_arg.format     = format;
    kernel_arg.filter     = filter;
-    kernel_arg.wrap       = wrap;
-    kernel_arg.use_sw     = use_sw;
-    kernel_arg.lod        = 0x0;
+    kernel_arg.wrapu      = wrap;
+    kernel_arg.wrapv      = wrap;    
    
-    kernel_arg.src_logWidth  = (uint32_t)std::log2(src_width);
-    kernel_arg.src_logHeight = (uint32_t)std::log2(src_height);
-    kernel_arg.src_stride = src_bpp;
-    kernel_arg.src_pitch  = src_bpp * src_width;
-    kernel_arg.src_ptr    = src_addr;
+    kernel_arg.src_logwidth  = src_logwidth;
+    kernel_arg.src_logheight = src_logheight;
+    kernel_arg.src_addr      = src_addr;
+    kernel_arg.lod           = lod;
+
+    for (uint32_t i = 0; i < mip_offsets.size(); ++i) {
+      assert(i < TEX_LOD_MAX);
+      kernel_arg.mip_offs[i] = mip_offsets.at(i); 
+    }

    kernel_arg.dst_width  = dst_width;
    kernel_arg.dst_height = dst_height;
    kernel_arg.dst_stride = dst_bpp;
    kernel_arg.dst_pitch  = dst_bpp * dst_width;    
-    kernel_arg.dst_ptr    = dst_addr;
+    kernel_arg.dst_addr   = dst_addr;

-    auto buf_ptr = (int*)vx_host_ptr(buffer);
+    auto buf_ptr = (uint8_t*)vx_host_ptr(buffer);
    memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
    RT_CHECK(vx_copy_to_dev(buffer, KERNEL_ARG_DEV_MEM_ADDR, sizeof(kernel_arg_t), 0));
  }
@@ -225,21 +248,21 @@ int main(int argc, char *argv[]) {
  // upload source buffer
  std::cout << "upload source buffer" << std::endl;      
  {    
-    auto buf_ptr = (int8_t*)vx_host_ptr(buffer);
+    auto buf_ptr = (uint8_t*)vx_host_ptr(buffer);
    for (uint32_t i = 0; i < src_bufsize; ++i) {
      buf_ptr[i] = src_pixels[i];
    }      
-    RT_CHECK(vx_copy_to_dev(buffer, kernel_arg.src_ptr, src_bufsize, 0));
+    RT_CHECK(vx_copy_to_dev(buffer, kernel_arg.src_addr, src_bufsize, 0));
  }

  // clear destination buffer
  std::cout << "clear destination buffer" << std::endl;      
  {    
-    auto buf_ptr = (int32_t*)vx_host_ptr(buffer);
+    auto buf_ptr = (uint32_t*)vx_host_ptr(buffer);
    for (uint32_t i = 0; i < (dst_bufsize/4); ++i) {
      buf_ptr[i] = 0xdeadbeef;
    }    
-    RT_CHECK(vx_copy_to_dev(buffer, kernel_arg.dst_ptr, dst_bufsize, 0));  
+    RT_CHECK(vx_copy_to_dev(buffer, kernel_arg.dst_addr, dst_bufsize, 0));  
  }

  // run tests
--- a/tests/regression/tex/texsw.h
+++ b/tests/regression/tex/texsw.h
@@ -1,167 +1,122 @@
-#ifndef _TEXSW_H_
+#pragma once

+#include <vx_intrinsics.h>
+#include <texturing.h>
 #include "common.h"

-#define TEX_LOD_MAX 11
-
-#define MIN(x, y)   ((x < y) ? (x) : (y))
-
-#define MAX(x, y)   ((x > y) ? (x) : (y))
-
-inline int address(int wrap, int value) {
-    switch (wrap) {
-    case 1: return value & 0xfffff;
-    default:
-    case 0: return MIN(MAX(value, 0), 0xfffff);
+inline uint32_t texel_read(uint8_t* address, uint32_t stride) { // loads one texel of `stride` bytes from `address`, widened to 32 bits
+    switch (stride) {
+    case 1: return *(uint8_t*)address; // 8-bit texel
+    case 2: return *(uint16_t*)address; // 16-bit texel
+    case 4: return *(uint32_t*)address; // 32-bit texel
+    default: 
+        std::abort(); // any other texel size is unsupported
+        return 0; // not reached; keeps the non-void function well-formed
     }
 }

-inline void unpack(int format, int value, int* l, int* h) {
-    switch (format) {
-    case 1:
-    case 2:
-        *l = value;
-        *h = 0;
-        break;
-    case 3:
-        *l = (value | (value << 8)) & 0x00ff00ff;
-        *h = 0;
-        break;
-    case 4:
-        *l = (value | (value << 16)) & 0x07e0f81f;
-        *h = 0;
-        break;
-    case 5:
-        *l = (value | (value << 12)) & 0x0f0f0f0f;
-        *h = 0;
-        break;
-    default:
-    case 0: 
-        *l = value & 0x00ff00ff;
-        *h = (value >> 8) & 0x00ff00ff;
-        break;
-    }
-}
+inline uint32_t vx_tex_sw(kernel_arg_t* state, 
+                          Fixed<TEX_FXD_FRAC> xu, 
+                          Fixed<TEX_FXD_FRAC> xv, 
+                          uint32_t lod) {
+    uint8_t* base_addr  = ((uint8_t*)state->src_addr) + state->mip_offs[lod];
+	uint32_t log_width  = std::max<int32_t>(state->src_logwidth - lod, 0);
+	uint32_t log_height = std::max<int32_t>(state->src_logheight - lod, 0);
+	auto format = (TexFormat)state->format;
+	auto wrapu  = (WrapMode)state->wrapu;
+    auto wrapv  = (WrapMode)state->wrapv;
+	auto filter = state->filter;
+    auto stride = Stride(format);    

-inline void lerp(int al, int ah, int bl, int bh, int frac, int* l, int* h) {
-    *l = (al + (((bl - al) * frac) >> 8)) & 0x00ff00ff;
-    *h = (ah + (((bh - ah) * frac) >> 8)) & 0x00ff00ff;
-}
-
-inline int pack(int format, int l, int h) {
-    switch (format) {
-    case 1:
-    case 2:
-        return l;
-    case 3:
-        return (l | (l >> 8)) & 0xffff;
-    case 4:
-         return (l | (l >> 16)) & 0xffff;
-    case 5:
-        return (l | (l >> 12)) & 0xffff;
-    default:
-    case 0: 
-        return (h << 8) | l;
-    }
-}
-
-inline int tex_sw(kernel_arg_t* state, int stage, int u, int v, int lod) {
-    int base_addr  = state->src_ptr;
-	int mip_offset = 0;
-	int log_width  = state->src_logWidth;
-	int log_height = state->src_logHeight;
-	int format     = state->format;
-	int wrap       = state->wrap;
-	int filter     = state->filter;
-
-    int32_t* pBits = ((uint32_t*)base_addr) + mip_offset;    
+    uint32_t color;

    if (filter) {
-        int u0 = address(wrap, u - (0x80000 >> log_width));
-        int v0 = address(wrap, v - (0x80000 >> log_height)); 
-        int u1 = address(wrap, u + (0x80000 >> log_width));    
-        int v1 = address(wrap, v + (0x80000 >> log_height));
+        // addressing
+        uint32_t offset00, offset01, offset10, offset11;
+        uint32_t alpha, beta;
+        TexAddressLinear(xu, xv, log_width, log_height, wrapu, wrapv, 
+            &offset00, &offset01, &offset10, &offset11, &alpha, &beta);

-        int x0 = u0 >> (20 - log_width);
-        int y0 = v0 >> (20 - log_height);
-        int x1 = u1 >> (20 - log_width);
-        int y1 = v1 >> (20 - log_height); 
+        uint8_t* addr00 = base_addr + offset00 * stride;
+        uint8_t* addr01 = base_addr + offset01 * stride;
+        uint8_t* addr10 = base_addr + offset10 * stride;
+        uint8_t* addr11 = base_addr + offset11 * stride;

        // memory lookup
-
-        int c0 = pBits[x0 + (y0 << log_width)];
-        int c1 = pBits[x1 + (y0 << log_width)];
-        int c2 = pBits[x0 + (y1 << log_width)];
-        int c3 = pBits[x1 + (y1 << log_width)];
+        uint32_t texel00 = texel_read(addr00, stride);
+        uint32_t texel01 = texel_read(addr01, stride);
+        uint32_t texel10 = texel_read(addr10, stride);
+        uint32_t texel11 = texel_read(addr11, stride);

        // filtering
-
-        int alpha = x0 & 0xff;
-        int beta  = y0 & 0xff;
-
-        int c0a, c0b;  
-        int c1a, c1b;
-        int c01a, c01b;
-
-        unpack(format, c0, &c0a, &c0b);
-        unpack(format, c1, &c1a, &c1b);
-        lerp(c0a, c0b, c1a, c1b, alpha, &c01a, &c01b);
-
-        int c2a, c2b;  
-        int c3a, c3b;
-        int c23a, c23b;
-
-        unpack(format, c2, &c2a, &c2b);
-        unpack(format, c3, &c3a, &c3b);
-        lerp(c2a, c2b, c3a, c3b, alpha, &c23a, &c23b);
-
-        int c4a, c4b;
-        lerp(c01a, c01b, c23a, c23b, beta, &c4a, &c4b);
-        return pack(format, c4a, c4b);
+        color = TexFilterLinear(
+            format, texel00, texel01, texel10, texel11, alpha, beta);
    } else {
-        int u0 = address(wrap, u);
-        int v0 = address(wrap, v);  
+        // addressing
+        uint32_t offset;
+        TexAddressPoint(xu, xv, log_width, log_height, wrapu, wrapv, &offset);
+        
+        uint8_t* addr = base_addr + offset * stride;
+        
+        // memory lookup
+        uint32_t texel = texel_read(addr, stride);

-        int x0 = u0 >> (20 - log_width);
-        int y0 = v0 >> (20 - log_height);  
-
-        int c0 = pBits[x0 + (y0 <<log_width)];
-
-        int c0a, c0b;  
-        unpack(format, c0, &c0a, &c0b);
-        return pack(format, c0a, c0b);
+        // filtering
+        color = TexFilterPoint(format, texel);
    }
+    return color;
 }

-inline int vx_tex3(int stage, int u, int v, int lod) {
-    int lodn = MIN(lod + 0x100000, TEX_LOD_MAX);
-    int a = vx_tex(0, u, v, lod);  
-    int b = vx_tex(0, u, v, lodn);  
-    int al = a & 0x00ff00ff;
-    int ah = (a >> 8) & 0x00ff00ff;    
-    int bl = b & 0x00ff00ff;
-    int bh = (b >> 8) & 0x00ff00ff;
-    int frac = (lod >> 12) & 0xff;
-    int cl = (al + (((bl - al) * frac) >> 8)) & 0x00ff00ff;
-    int ch = (ah + (((bh - ah) * frac) >> 8)) & 0x00ff00ff;
-    int c = al | (ah << 8);
-    return c;
+// Samples texture stage 0 through the vx_tex intrinsic.
+//   state : kernel arguments; only state->filter is read here
+//   xu,xv : fixed-point texture coordinates, passed to vx_tex as raw bits
+//   xlod  : level of detail with 16 fractional bits; values below 1.0 are
+//           clamped to 1.0 and the mip level is floor(log2(xlod)), capped at
+//           TEX_LOD_MAX
+// When state->filter == 2 (trilinear) the two nearest mip levels are fetched
+// and blended with an 8-bit fraction; otherwise one fetch is returned as is.
+inline uint32_t tex_load_hw(kernel_arg_t* state,
+                            Fixed<TEX_FXD_FRAC> xu,
+                            Fixed<TEX_FXD_FRAC> xv,
+                            Fixed<16> xlod) {
+    uint32_t color;
+    int32_t ilod = std::max<int32_t>(xlod.data(), Fixed<16>::ONE);
+    uint32_t lod = std::min<uint32_t>(log2floor(ilod) - 16, TEX_LOD_MAX);
+    if (state->filter == 2) {
+        uint32_t lod_n  = std::min<uint32_t>(lod + 1, TEX_LOD_MAX);
+        // While lod is not clamped, ilod lies in [2^(lod+16), 2^(lod+17)), so
+        // the shift leaves the implicit leading one in bit 8. Mask it off so
+        // that only the 8-bit fraction between the two levels is blended
+        // (lod == 1.0 must yield frac == 0, not 256).
+        uint32_t frac   = (ilod >> (lod + 16 - 8)) & 0xff;
+        uint32_t texel0 = vx_tex(0, xu.data(), xv.data(), lod);
+        uint32_t texel1 = vx_tex(0, xu.data(), xv.data(), lod_n);
+        uint32_t cl, ch;
+        {
+            uint32_t c0l, c0h;
+            uint32_t c1l, c1h;
+            Unpack8888(TexFormat::R8G8B8A8, texel0, &c0l, &c0h);
+            Unpack8888(TexFormat::R8G8B8A8, texel1, &c1l, &c1h);
+            Lerp8888(c0l, c0h, c1l, c1h, frac, &cl, &ch);
+        }
+        color = Pack8888(TexFormat::R8G8B8A8, cl, ch);
+    } else {
+        color = vx_tex(0, xu.data(), xv.data(), lod);
+    }
+    return color;
 }

-inline int tex3_sw(kernel_arg_t* state, int stage, int u, int v, int lod) {
-    int lodn = MIN(lod + 0x10000, TEX_LOD_MAX);
-    int a = tex_sw(state, 0, u, v, lod);    
-    int b = tex_sw(state, 0, u, v, lodn);
-    int al = a & 0x00ff00ff;
-    int ah = (a >> 8) & 0x00ff00ff;
-    
-    int bl = b & 0x00ff00ff;
-    int bh = (b >> 8) & 0x00ff00ff;
-    int frac = (lod >> 12) & 0xff;
-    int cl = (al + (((bl - al) * frac) >> 8)) & 0x00ff00ff;
-    int ch = (ah + (((bh - ah) * frac) >> 8)) & 0x00ff00ff;
-    int c = al | (ah << 8);
-    return c;
-}
-
-#endif
+// Software counterpart of tex_load_hw: same level-of-detail selection, but
+// each fetch goes through vx_tex_sw instead of the vx_tex intrinsic.
+//   state : kernel arguments (forwarded to vx_tex_sw; state->filter picks
+//           the trilinear path when it equals 2)
+//   xu,xv : fixed-point texture coordinates
+//   xlod  : level of detail with 16 fractional bits; values below 1.0 are
+//           clamped to 1.0 and the mip level is floor(log2(xlod)), capped at
+//           TEX_LOD_MAX
+inline uint32_t tex_load_sw(kernel_arg_t* state,
+                            Fixed<TEX_FXD_FRAC> xu,
+                            Fixed<TEX_FXD_FRAC> xv,
+                            Fixed<16> xlod) {
+    uint32_t color;
+    int32_t ilod = std::max<int32_t>(xlod.data(), Fixed<16>::ONE);
+    uint32_t lod = std::min<uint32_t>(log2floor(ilod) - 16, TEX_LOD_MAX);
+    if (state->filter == 2) {
+        uint32_t lod_n  = std::min<uint32_t>(lod + 1, TEX_LOD_MAX);
+        // While lod is not clamped, ilod lies in [2^(lod+16), 2^(lod+17)), so
+        // the shift leaves the implicit leading one in bit 8. Mask it off so
+        // that only the 8-bit fraction between the two levels is blended
+        // (lod == 1.0 must yield frac == 0, not 256).
+        uint32_t frac   = (ilod >> (lod + 16 - 8)) & 0xff;
+        uint32_t texel0 = vx_tex_sw(state, xu, xv, lod);
+        uint32_t texel1 = vx_tex_sw(state, xu, xv, lod_n);
+        uint32_t cl, ch;
+        {
+            uint32_t c0l, c0h;
+            uint32_t c1l, c1h;
+            Unpack8888(TexFormat::R8G8B8A8, texel0, &c0l, &c0h);
+            Unpack8888(TexFormat::R8G8B8A8, texel1, &c1l, &c1h);
+            Lerp8888(c0l, c0h, c1l, c1h, frac, &cl, &ch);
+        }
+        color = Pack8888(TexFormat::R8G8B8A8, cl, ch);
+    } else {
+        color = vx_tex_sw(state, xu, xv, lod);
+    }
+    return color;
+}
--- a/tests/regression/tex/utils.cpp
+++ b/tests/regression/tex/utils.cpp
@@ -191,4 +191,112 @@ int ConvertImage(std::vector<uint8_t>& dst_pixels,
  SurfaceDesc dstDesc{dst_format, dst_pixels.data(), width, height, dst_pitch};

  return CopyBuffers(dstDesc, 0, 0, width, height, srcDesc, 0, 0);
+}
+
+
+
+// Builds the complete mip chain of an image.
+//   dst_pixels  : receives all levels packed back to back, in `format`
+//   mip_offsets : receives one byte offset per level (index == lod)
+//   src_pixels  : level-0 image in `format`, src_width x src_height texels
+// Level 0 is a copy of the source; every further level halves each dimension
+// (clamped to one texel) by averaging the covered source texels.
+// Returns 0 on success. A non-zero ConvertImage() result is propagated,
+// assuming it follows the 0-on-success convention used by this function.
+int GenerateMipmaps(std::vector<uint8_t>& dst_pixels,
+                    std::vector<uint32_t>& mip_offsets,
+                    const std::vector<uint8_t>& src_pixels,
+                    ePixelFormat format,
+                    uint32_t src_width,
+                    uint32_t src_height) {
+  std::vector<uint8_t> src_staging, dst_staging;
+  const std::vector<uint8_t> *pSrcPixels;
+  std::vector<uint8_t> *pDstPixels;

+  // filtering runs on A8R8G8B8 texels; other formats go through staging buffers
+  bool need_conversion = (format != FORMAT_A8R8G8B8);
+  if (need_conversion) {
+    int err = ConvertImage(src_staging, src_pixels, src_width, src_height, format, FORMAT_A8R8G8B8);
+    if (err != 0)
+      return err;
+    pSrcPixels = &src_staging;
+    pDstPixels = &dst_staging;
+  } else {
+    pSrcPixels = &src_pixels;
+    pDstPixels = &dst_pixels;
+  }

+  uint32_t src_logwidth  = log2ceil(src_width);
+  uint32_t src_logheight = log2ceil(src_height);
+  uint32_t max_lod       = std::max(src_logwidth, src_logheight) + 1;

+  mip_offsets.resize(max_lod);

+  // Calculate mipmaps buffer size: the chain is described as a single row of
+  // dst_width texels (dst_height == 1); offsets are counted in texels for now
+  uint32_t dst_height = 1;
+  uint32_t dst_width = 0;
+  for (uint32_t lod = 0, w = src_width, h = src_height; lod < max_lod; ++lod) {
+    assert((w > 0) || (h > 0)); // at least one dimension must still be shrinking
+    uint32_t pw = std::max<uint32_t>(w, 1);
+    uint32_t ph = std::max<uint32_t>(h, 1);
+    mip_offsets.at(lod) = dst_width;
+    dst_width += pw * ph;
+    w >>= 1;
+    h >>= 1;
+  }

+  // allocate mipmap
+  pDstPixels->resize(dst_width * 4);

+  // generate mipmaps
+  {
+    auto pSrc = reinterpret_cast<const uint32_t*>(pSrcPixels->data());
+    auto pDst = reinterpret_cast<uint32_t*>(pDstPixels->data());

+    // copy level 0 (the size is checked first so the memcpy cannot overrun)
+    assert(pSrcPixels->size() == 4 * src_width * src_height);
+    memcpy(pDst, pSrc, pSrcPixels->size());
+    pSrc = pDst;
+    pDst += src_width * src_height;

+    // copy lower levels; sw/sh are the dimensions of the level being read,
+    // which is not always twice the level being written once a side hits 1
+    uint32_t sw = src_width;
+    uint32_t sh = src_height;
+    for (uint32_t lod = 1, w = (src_width/2), h = (src_height/2); lod < max_lod; ++lod) {
+      assert((w > 0) || (h > 0));
+      uint32_t pw = std::max<uint32_t>(w, 1);
+      uint32_t ph = std::max<uint32_t>(h, 1);
+      for (uint32_t y = 0; y < ph; ++y) {
+        auto v0 = 2 * y;
+        auto v1 = 2 * y + ((sh > 1) ? 1 : 0); // second row only if the source has one
+        auto pSrc0 = pSrc + v0 * sw;
+        auto pSrc1 = pSrc + v1 * sw;

+        for (uint32_t x = 0; x < pw; ++x) {
+          auto u0 = 2 * x;
+          auto u1 = 2 * x + ((sw > 1) ? 1 : 0); // second column only if the source has one

+          auto c00 = Format::ConvertFrom<FORMAT_A8R8G8B8, false>(pSrc0 + u0);
+          auto c01 = Format::ConvertFrom<FORMAT_A8R8G8B8, false>(pSrc0 + u1);
+          auto c10 = Format::ConvertFrom<FORMAT_A8R8G8B8, false>(pSrc1 + u0);
+          auto c11 = Format::ConvertFrom<FORMAT_A8R8G8B8, false>(pSrc1 + u1);

+          // rounded per-channel average of the four taps
+          const ColorARGB color((c00.a + c01.a + c10.a + c11.a+2) >> 2,
+                                (c00.r + c01.r + c10.r + c11.r+2) >> 2,
+                                (c00.g + c01.g + c10.g + c11.g+2) >> 2,
+                                (c00.b + c01.b + c10.b + c11.b+2) >> 2);

+          uint32_t ncolor;
+          Format::ConvertTo<FORMAT_A8R8G8B8>(&ncolor, color);
+          pDst[x + y * pw] = ncolor;
+        }
+      }
+      // the level just written becomes the source of the next one
+      pSrc = pDst;
+      pDst += pw * ph;
+      sw = pw;
+      sh = ph;
+      w >>= 1;
+      h >>= 1;
+    }
+    assert((pDst - reinterpret_cast<uint32_t*>(pDstPixels->data())) == dst_width);
+  }

+  // convert destination image if needed: the chain was built in dst_staging
+  // and must land in the caller's dst_pixels in the requested format
+  if (need_conversion) {
+    int err = ConvertImage(dst_pixels, dst_staging, dst_width, dst_height, FORMAT_A8R8G8B8, format);
+    if (err != 0)
+      return err;
+  }

+  // offsets were accumulated in texels; scale them to bytes of `format`
+  uint32_t bpp = Format::GetInfo(format).BytePerPixel;
+  for (auto& offset : mip_offsets) {
+    offset *= bpp;
+  }

+  return 0;
 }
--- a/tests/regression/tex/utils.h
+++ b/tests/regression/tex/utils.h
@@ -1,14 +1,9 @@
 #include <cstdint>
 #include <vector>
 #include <iostream>
+#include <bitmanip.h>
 #include "surfacedesc.h"

-#define ISPOW2(x)   (((x) != 0) && (0 == ((x) & ((x) - 1))))
-
-inline uint32_t ilog2 (uint32_t value) {
-  	return (uint32_t)(sizeof(uint32_t) * 8UL) - (uint32_t)__builtin_clzl((value << 1) - 1UL) - 1;
-}
-
 int LoadImage(const char *filename,
              ePixelFormat format,
              std::vector<uint8_t> &pixels,
@@ -37,7 +32,14 @@ int ConvertImage(std::vector<uint8_t>& dst_pixels,
                 ePixelFormat src_format,
                 ePixelFormat dst_format);

+int GenerateMipmaps(std::vector<uint8_t>& dst_pixels,
+                    std::vector<uint32_t>& mip_offsets,
+                    const std::vector<uint8_t>& src_pixels,
+                    ePixelFormat format,
+                    uint32_t src_width,
+                    uint32_t src_height);
+
 void dump_image(const std::vector<uint8_t>& pixels, 
                uint32_t width, 
                uint32_t height, 
-                uint32_t bpp);
+                uint32_t bpp);