matrix fixed
This commit is contained in:
34
perflab/matrix/Makefile
Normal file
34
perflab/matrix/Makefile
Normal file
@@ -0,0 +1,34 @@
|
||||
# Build file for the matrix row/column-sum performance lab.
#
# Usage:
#   make         build $(TARGET)
#   make clean   remove objects, generated dependency files and the binary

CC = gcc
CFLAGS = -Wall -O1 -g
# -MMD writes a .d file next to each object listing the headers that the
# source actually included; -MP adds phony targets for those headers so a
# deleted header does not break the build.  Without this, editing a header
# would not rebuild the objects that use it.
DEPFLAGS = -MMD -MP
#LDFLAGS = -lm -lcudart -lcuda

# Source files
SRCS = rowcol_test.c clock.c cpe.c fcyc.c lsquare.c rowcol_202302723005.c
#CUDA_SRCS = rowcol.cu
OBJS = $(SRCS:.c=.o)
DEPS = $(OBJS:.o=.d)
#rowcol.o

# Target executable
TARGET = matrix_test

# Default target
all: $(TARGET)

# Rule to build the executable
$(TARGET): $(OBJS)
	$(CC) $(OBJS) -o $(TARGET) $(LDFLAGS)

# Rule to build object files (also emits the matching .d dependency file)
%.o: %.c
	$(CC) $(CFLAGS) $(DEPFLAGS) -c $< -o $@

# Rule to build CUDA object files
#rowcol.o: rowcol.cu
#	$(NVCC) $(CUDA_FLAGS) -c $< -o $@

# Clean rule
clean:
	rm -f $(OBJS) $(DEPS) $(TARGET)

# Phony targets
.PHONY: all clean

# Pull in the generated header dependencies; the leading '-' keeps make quiet
# on a fresh tree where no .d files exist yet.
-include $(DEPS)
|
||||
@@ -1,229 +1,196 @@
|
||||
/* clock.c
|
||||
* Retrofitted to use thread-specific timers
|
||||
* and to get clock information from /proc/cpuinfo
|
||||
* (C) R. E. Bryant, 2010
|
||||
*
|
||||
*/
|
||||
|
||||
/* When this constant is not defined, uses time stamp counter */
|
||||
#define USE_POSIX 0
|
||||
|
||||
/* Choice to use cpu_gettime call or Intel time stamp counter directly */
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <intrin.h>
|
||||
//#include <intrinsics.h>
|
||||
#include <windows.h>
|
||||
#include <time.h>
|
||||
#include "clock.h"
|
||||
|
||||
/* Use x86 cycle counter */
|
||||
|
||||
/* Initialize the cycle counter */
|
||||
static unsigned cyc_hi = 0;
|
||||
static unsigned cyc_lo = 0;
|
||||
|
||||
/* Set *hi and *lo to the high and low order bits of the cycle counter.
|
||||
Implementation requires assembly code to use the rdtsc instruction. */
|
||||
void access_counter(unsigned *hi, unsigned *lo)
|
||||
{
|
||||
|
||||
long long counter;
|
||||
|
||||
counter = __rdtsc();
|
||||
(*hi) = (unsigned int)(counter >> 32);
|
||||
(*lo) = (unsigned int)counter;
|
||||
/*
|
||||
|
||||
LARGE_INTEGER lPerformanceCount;
|
||||
|
||||
QueryPerformanceCounter(&lPerformanceCount);
|
||||
(*hi) = (unsigned int)lPerformanceCount.HighPart;
|
||||
(*lo) = (unsigned int)lPerformanceCount.LowPart;
|
||||
// printf("%08X %08X\n",(*hi),(*lo));
|
||||
*/
|
||||
}
|
||||
|
||||
|
||||
/* Record the current value of the cycle counter. */
|
||||
void start_counter()
|
||||
{
|
||||
access_counter(&cyc_hi, &cyc_lo);
|
||||
}
|
||||
|
||||
/* Return the number of cycles since the last call to start_counter. */
|
||||
double get_counter()
|
||||
{
|
||||
unsigned ncyc_hi, ncyc_lo;
|
||||
unsigned hi, lo, borrow;
|
||||
double result;
|
||||
|
||||
/* Get cycle counter */
|
||||
access_counter(&ncyc_hi, &ncyc_lo);
|
||||
|
||||
/* Do double precision subtraction */
|
||||
lo = ncyc_lo - cyc_lo;
|
||||
borrow = cyc_lo > ncyc_lo;
|
||||
hi = ncyc_hi - cyc_hi - borrow;
|
||||
result = (double) hi * (1 << 30) * 4 + lo;
|
||||
return result;
|
||||
}
|
||||
void make_CPU_busy(void)
|
||||
{
|
||||
volatile double old_tick,new_tick;
|
||||
start_counter();
|
||||
old_tick = get_counter();
|
||||
new_tick = get_counter();
|
||||
while (new_tick - old_tick < 1000000000)
|
||||
new_tick = get_counter();
|
||||
}
|
||||
|
||||
//CPU的频率
|
||||
double mhz(int verbose)
|
||||
{
|
||||
LARGE_INTEGER lFrequency;
|
||||
LARGE_INTEGER lPerformanceCount_Start;
|
||||
LARGE_INTEGER lPerformanceCount_End;
|
||||
double mhz;
|
||||
double fTime;
|
||||
__int64 _i64StartCpuCounter;
|
||||
__int64 _i64EndCpuCounter;
|
||||
//On a multiprocessor machine, it should not matter which processor is called.
|
||||
//However, you can get different results on different processors due to bugs in
|
||||
//the BIOS or the HAL. To specify processor affinity for a thread, use the SetThreadAffinityMask function.
|
||||
HANDLE hThread=GetCurrentThread();
|
||||
SetThreadAffinityMask(hThread,0x1);
|
||||
|
||||
//主板上高精度定时器的晶振频率
|
||||
//这个定时器应该就是一片8253或者8254
|
||||
//在intel ich7中集成了8254
|
||||
QueryPerformanceFrequency(&lFrequency);
|
||||
// if (verbose>0)
|
||||
// printf("高精度定时器的晶振频率:%1.0fHz.\n",(double)lFrequency.QuadPart);
|
||||
|
||||
//这个定时器每经过一个时钟周期,其计数器会+1
|
||||
QueryPerformanceCounter(&lPerformanceCount_Start);
|
||||
|
||||
//RDTSC指令:获取CPU经历的时钟周期数
|
||||
_i64StartCpuCounter=__rdtsc();
|
||||
|
||||
//延时长一点,误差会小一点
|
||||
//int nTemp=100000;
|
||||
//while (--nTemp);
|
||||
Sleep(200);
|
||||
|
||||
QueryPerformanceCounter(&lPerformanceCount_End);
|
||||
|
||||
_i64EndCpuCounter=__rdtsc();
|
||||
|
||||
//f=1/T => f=计数次数/(计数次数*T)
|
||||
//这里的“计数次数*T”就是时间差
|
||||
fTime=((double)lPerformanceCount_End.QuadPart-(double)lPerformanceCount_Start.QuadPart)
|
||||
/(double)lFrequency.QuadPart;
|
||||
|
||||
mhz = (_i64EndCpuCounter-_i64StartCpuCounter)/(fTime*1000000.0);
|
||||
if (verbose>0)
|
||||
printf("CPU频率为:%1.6fMHz.\n",mhz);
|
||||
return mhz;
|
||||
}
|
||||
|
||||
double CPU_Factor1(void)
|
||||
{
|
||||
double result;
|
||||
int i,j,k,ii,jj,kk;
|
||||
LARGE_INTEGER lStart,lEnd;
|
||||
LARGE_INTEGER lFrequency;
|
||||
HANDLE hThread;
|
||||
double fTime;
|
||||
|
||||
QueryPerformanceFrequency(&lFrequency);
|
||||
|
||||
ii = 43273;
|
||||
kk = 1238;
|
||||
result = 1;
|
||||
jj = 1244;
|
||||
|
||||
hThread=GetCurrentThread();
|
||||
SetThreadAffinityMask(hThread,0x1);
|
||||
QueryPerformanceCounter(&lStart);
|
||||
//_asm("cpuid");
|
||||
start_counter();
|
||||
for (i=0;i<100;i++)
|
||||
for (j=0;j<1000;j++)
|
||||
for (k=0;k<1000;k++)
|
||||
kk += kk*ii+jj;
|
||||
|
||||
result = get_counter();
|
||||
QueryPerformanceCounter(&lEnd);
|
||||
fTime=((double)lEnd.QuadPart-(double)lStart.QuadPart);
|
||||
printf("CPU运行时间为%f",result);
|
||||
printf("\t %f\n",fTime);
|
||||
return result;
|
||||
}
|
||||
|
||||
double CPU_Factor(void)
|
||||
{
|
||||
double frequency;
|
||||
double multiplier = 1000 * 1000 * 1000;//nano
|
||||
LARGE_INTEGER lFrequency;
|
||||
LARGE_INTEGER start,stop;
|
||||
HANDLE hThread;
|
||||
int i;
|
||||
const int gigahertz= 1000*1000*1000;
|
||||
const int known_instructions_per_loop = 27317;
|
||||
|
||||
int iterations = 100000000;
|
||||
int g = 0;
|
||||
double normal_ticks_per_second;
|
||||
double ticks;
|
||||
double time;
|
||||
double loops_per_sec;
|
||||
double instructions_per_loop;
|
||||
double ratio;
|
||||
double actual_freq;
|
||||
|
||||
QueryPerformanceFrequency(&lFrequency);
|
||||
frequency = (double)lFrequency.QuadPart;
|
||||
|
||||
hThread=GetCurrentThread();
|
||||
SetThreadAffinityMask(hThread,0x1);
|
||||
QueryPerformanceCounter(&start);
|
||||
for( i = 0; i < iterations; i++)
|
||||
{
|
||||
g++;
|
||||
g++;
|
||||
g++;
|
||||
g++;
|
||||
}
|
||||
QueryPerformanceCounter(&stop);
|
||||
|
||||
//normal ticks differs from the WMI data, i.e 3125, when WMI 3201, and CPUZ 3199
|
||||
normal_ticks_per_second = frequency * 1000;
|
||||
ticks = (double)((double)stop.QuadPart - (double)start.QuadPart);
|
||||
time = (ticks * multiplier) /frequency;
|
||||
loops_per_sec = iterations / (time/multiplier);
|
||||
instructions_per_loop = normal_ticks_per_second / loops_per_sec;
|
||||
|
||||
ratio = (instructions_per_loop / known_instructions_per_loop);
|
||||
actual_freq = normal_ticks_per_second / ratio;
|
||||
/*
|
||||
actual_freq = normal_ticks_per_second / ratio;
|
||||
actual_freq = known_instructions_per_loop*iterations*multiplier/time;
|
||||
|
||||
2293 = x/time;
|
||||
|
||||
2292.599713*1191533038.809362=known_instructions_per_loop*100000000*1000
|
||||
loops_per_sec = iterations*frequency / ticks
|
||||
|
||||
instructions_per_loop = / loops_per_sec;
|
||||
*/
|
||||
printf("Perf counter freq: %f\n", normal_ticks_per_second);
|
||||
printf("Loops per sec: %f\n", loops_per_sec);
|
||||
printf("Perf counter freq div loops per sec: %f\n", instructions_per_loop);
|
||||
printf("Presumed freq: %f\n", actual_freq);
|
||||
printf("ratio: %f\n", ratio);
|
||||
printf("time=%f\n",time);
|
||||
return ratio;
|
||||
}
|
||||
/* clock.c
|
||||
* Retrofitted to use thread-specific timers
|
||||
* and to get clock information from /proc/cpuinfo
|
||||
* (C) R. E. Bryant, 2010
|
||||
* Modified for cross-platform compatibility
|
||||
*/
|
||||
|
||||
#define _GNU_SOURCE // For sched_setaffinity on Linux
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#ifdef _WIN32
|
||||
#include <intrin.h>
|
||||
#include <windows.h>
|
||||
#else
|
||||
#include <sched.h>
|
||||
#include <time.h>
|
||||
#include <unistd.h>
|
||||
#include <x86intrin.h>
|
||||
typedef struct {
|
||||
uint64_t QuadPart;
|
||||
} LARGE_INTEGER;
|
||||
typedef void *HANDLE;
|
||||
#define __int64 long long
|
||||
#define Sleep(ms) usleep((ms) * 1000)
|
||||
#endif
|
||||
|
||||
#include "clock.h"
|
||||
|
||||
/* Use x86 cycle counter */
|
||||
static unsigned cyc_hi = 0;
|
||||
static unsigned cyc_lo = 0;
|
||||
|
||||
/*
 * Read the time-stamp counter via __rdtsc() and hand it back in two halves.
 *
 *   hi - receives the counter value shifted right by 32 bits
 *   lo - receives the counter value truncated to an unsigned
 */
void access_counter(unsigned *hi, unsigned *lo)
{
    const uint64_t tsc = __rdtsc();

    *lo = (unsigned)tsc;
    *hi = (unsigned)(tsc >> 32);
}
|
||||
|
||||
/* Snapshot the current counter value into the file-scope cyc_hi/cyc_lo pair. */
void start_counter()
{
    access_counter(&cyc_hi, &cyc_lo);
}
|
||||
|
||||
double get_counter() {
|
||||
unsigned ncyc_hi, ncyc_lo;
|
||||
access_counter(&ncyc_hi, &ncyc_lo);
|
||||
uint64_t start = ((uint64_t)cyc_hi << 32) | cyc_lo;
|
||||
uint64_t end = ((uint64_t)ncyc_hi << 32) | ncyc_lo;
|
||||
return (double)(end - start);
|
||||
}
|
||||
|
||||
/*
 * Spin until get_counter() has advanced by at least BUSY_TICKS from the
 * first reading taken here.
 *
 * This function does not call start_counter(), so the snapshot that
 * get_counter() measures against is left untouched for the caller.
 */
void make_CPU_busy(void)
{
    enum { BUSY_TICKS = 1000000000 };

    volatile double old_tick = get_counter();
    /*
     * Start from the reference reading: the loop condition reads new_tick
     * before the first iteration, and reading it uninitialized would be
     * undefined behaviour.
     */
    volatile double new_tick = old_tick;

    while ((new_tick - old_tick) < BUSY_TICKS) {
        new_tick = get_counter();
    }
}
|
||||
|
||||
#ifdef _WIN32
|
||||
#define GET_TIME(dest) QueryPerformanceCounter(dest)
|
||||
#else
|
||||
static inline void GET_TIME(LARGE_INTEGER *dest) {
|
||||
struct timespec ts;
|
||||
clock_gettime(CLOCK_MONOTONIC, &ts);
|
||||
dest->QuadPart = (uint64_t)ts.tv_sec * 1000000000 + ts.tv_nsec;
|
||||
}
|
||||
#define QueryPerformanceFrequency(freq) ((freq)->QuadPart = 1000000000)
|
||||
#endif
|
||||
|
||||
double mhz(int verbose) {
|
||||
LARGE_INTEGER lFrequency;
|
||||
LARGE_INTEGER lPerformanceCount_Start;
|
||||
LARGE_INTEGER lPerformanceCount_End;
|
||||
double mhz;
|
||||
double fTime;
|
||||
__int64 _i64StartCpuCounter;
|
||||
__int64 _i64EndCpuCounter;
|
||||
|
||||
#ifdef _WIN32
|
||||
HANDLE hThread = GetCurrentThread();
|
||||
SetThreadAffinityMask(hThread, 0x1);
|
||||
#else
|
||||
cpu_set_t cpuset;
|
||||
CPU_ZERO(&cpuset);
|
||||
CPU_SET(0, &cpuset);
|
||||
sched_setaffinity(0, sizeof(cpuset), &cpuset);
|
||||
#endif
|
||||
|
||||
QueryPerformanceFrequency(&lFrequency);
|
||||
GET_TIME(&lPerformanceCount_Start);
|
||||
_i64StartCpuCounter = __rdtsc();
|
||||
Sleep(200);
|
||||
GET_TIME(&lPerformanceCount_End);
|
||||
_i64EndCpuCounter = __rdtsc();
|
||||
|
||||
fTime = (lPerformanceCount_End.QuadPart - lPerformanceCount_Start.QuadPart) /
|
||||
(double)lFrequency.QuadPart;
|
||||
mhz = (_i64EndCpuCounter - _i64StartCpuCounter) / (fTime * 1000000.0);
|
||||
|
||||
if (verbose > 0) {
|
||||
printf("CPU频率为: %.6fMHz.\n", mhz);
|
||||
}
|
||||
return mhz;
|
||||
}
|
||||
|
||||
double CPU_Factor1(void) {
|
||||
double result;
|
||||
int i, j, k;
|
||||
LARGE_INTEGER lStart, lEnd;
|
||||
LARGE_INTEGER lFrequency;
|
||||
double fTime;
|
||||
|
||||
#ifdef _WIN32
|
||||
HANDLE hThread = GetCurrentThread();
|
||||
SetThreadAffinityMask(hThread, 0x1);
|
||||
#else
|
||||
cpu_set_t cpuset;
|
||||
CPU_ZERO(&cpuset);
|
||||
CPU_SET(0, &cpuset);
|
||||
sched_setaffinity(0, sizeof(cpuset), &cpuset);
|
||||
#endif
|
||||
|
||||
QueryPerformanceFrequency(&lFrequency);
|
||||
GET_TIME(&lStart);
|
||||
start_counter();
|
||||
|
||||
for (i = 0; i < 100; i++)
|
||||
for (j = 0; j < 1000; j++)
|
||||
for (k = 0; k < 1000; k++)
|
||||
;
|
||||
|
||||
result = get_counter();
|
||||
GET_TIME(&lEnd);
|
||||
|
||||
fTime = (lEnd.QuadPart - lStart.QuadPart) / (double)lFrequency.QuadPart;
|
||||
printf("CPU计算时长为: %f", result);
|
||||
printf("\t %f\n", fTime);
|
||||
return result;
|
||||
}
|
||||
|
||||
double CPU_Factor(void) {
|
||||
double frequency;
|
||||
double multiplier = 1000 * 1000 * 1000; // nano
|
||||
LARGE_INTEGER lFrequency;
|
||||
LARGE_INTEGER start, stop;
|
||||
int i;
|
||||
const int known_instructions_per_loop = 27317;
|
||||
int iterations = 100000000;
|
||||
int g = 0;
|
||||
double normal_ticks_per_second;
|
||||
double ticks;
|
||||
double time;
|
||||
double loops_per_sec;
|
||||
double instructions_per_loop;
|
||||
double ratio;
|
||||
double actual_freq;
|
||||
|
||||
#ifdef _WIN32
|
||||
HANDLE hThread = GetCurrentThread();
|
||||
SetThreadAffinityMask(hThread, 0x1);
|
||||
#else
|
||||
cpu_set_t cpuset;
|
||||
CPU_ZERO(&cpuset);
|
||||
CPU_SET(0, &cpuset);
|
||||
sched_setaffinity(0, sizeof(cpuset), &cpuset);
|
||||
#endif
|
||||
|
||||
QueryPerformanceFrequency(&lFrequency);
|
||||
frequency = (double)lFrequency.QuadPart;
|
||||
GET_TIME(&start);
|
||||
|
||||
for (i = 0; i < iterations; i++) {
|
||||
g++;
|
||||
g++;
|
||||
g++;
|
||||
g++;
|
||||
}
|
||||
|
||||
GET_TIME(&stop);
|
||||
normal_ticks_per_second = frequency * 1000;
|
||||
ticks = (double)(stop.QuadPart - start.QuadPart);
|
||||
time = (ticks * multiplier) / frequency;
|
||||
loops_per_sec = iterations / (time / multiplier);
|
||||
instructions_per_loop = normal_ticks_per_second / loops_per_sec;
|
||||
ratio = instructions_per_loop / known_instructions_per_loop;
|
||||
actual_freq = normal_ticks_per_second / ratio;
|
||||
|
||||
printf("Perf counter freq: %f\n", normal_ticks_per_second);
|
||||
printf("Loops per sec: %f\n", loops_per_sec);
|
||||
printf("Perf counter freq div loops per sec: %f\n", instructions_per_loop);
|
||||
printf("Presumed freq: %f\n", actual_freq);
|
||||
printf("ratio: %f\n", ratio);
|
||||
printf("time=%f\n", time);
|
||||
return ratio;
|
||||
}
|
||||
|
||||
229
perflab/matrix/clock.c.bak
Normal file
229
perflab/matrix/clock.c.bak
Normal file
@@ -0,0 +1,229 @@
|
||||
/* clock.c
|
||||
* Retrofitted to use thread-specific timers
|
||||
* and to get clock information from /proc/cpuinfo
|
||||
* (C) R. E. Bryant, 2010
|
||||
*
|
||||
*/
|
||||
|
||||
/* When this constant is not defined, uses time stamp counter */
|
||||
#define USE_POSIX 0
|
||||
|
||||
/* Choice to use cpu_gettime call or Intel time stamp counter directly */
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <x86intrin.h>
|
||||
//#include <intrinsics.h>
|
||||
//#include <windows.h>
|
||||
#include <time.h>
|
||||
#include "clock.h"
|
||||
|
||||
/* Use x86 cycle counter */
|
||||
|
||||
/* Initialize the cycle counter */
|
||||
static unsigned cyc_hi = 0;
|
||||
static unsigned cyc_lo = 0;
|
||||
|
||||
/* Set *hi and *lo to the high and low order bits of the cycle counter.
|
||||
Implementation requires assembly code to use the rdtsc instruction. */
|
||||
void access_counter(unsigned *hi, unsigned *lo)
|
||||
{
|
||||
|
||||
long long counter;
|
||||
|
||||
counter = __rdtsc();
|
||||
(*hi) = (unsigned int)(counter >> 32);
|
||||
(*lo) = (unsigned int)counter;
|
||||
/*
|
||||
|
||||
LARGE_INTEGER lPerformanceCount;
|
||||
|
||||
QueryPerformanceCounter(&lPerformanceCount);
|
||||
(*hi) = (unsigned int)lPerformanceCount.HighPart;
|
||||
(*lo) = (unsigned int)lPerformanceCount.LowPart;
|
||||
// printf("%08X %08X\n",(*hi),(*lo));
|
||||
*/
|
||||
}
|
||||
|
||||
|
||||
/* Record the current value of the cycle counter. */
|
||||
void start_counter()
|
||||
{
|
||||
access_counter(&cyc_hi, &cyc_lo);
|
||||
}
|
||||
|
||||
/* Return the number of cycles since the last call to start_counter. */
|
||||
double get_counter()
|
||||
{
|
||||
unsigned ncyc_hi, ncyc_lo;
|
||||
unsigned hi, lo, borrow;
|
||||
double result;
|
||||
|
||||
/* Get cycle counter */
|
||||
access_counter(&ncyc_hi, &ncyc_lo);
|
||||
|
||||
/* Do double precision subtraction */
|
||||
lo = ncyc_lo - cyc_lo;
|
||||
borrow = cyc_lo > ncyc_lo;
|
||||
hi = ncyc_hi - cyc_hi - borrow;
|
||||
result = (double) hi * (1 << 30) * 4 + lo;
|
||||
return result;
|
||||
}
|
||||
void make_CPU_busy(void)
|
||||
{
|
||||
volatile double old_tick,new_tick;
|
||||
start_counter();
|
||||
old_tick = get_counter();
|
||||
new_tick = get_counter();
|
||||
while (new_tick - old_tick < 1000000000)
|
||||
new_tick = get_counter();
|
||||
}
|
||||
|
||||
//CPU的频率
|
||||
double mhz(int verbose)
|
||||
{
|
||||
LARGE_INTEGER lFrequency;
|
||||
LARGE_INTEGER lPerformanceCount_Start;
|
||||
LARGE_INTEGER lPerformanceCount_End;
|
||||
double mhz;
|
||||
double fTime;
|
||||
__int64 _i64StartCpuCounter;
|
||||
__int64 _i64EndCpuCounter;
|
||||
//On a multiprocessor machine, it should not matter which processor is called.
|
||||
//However, you can get different results on different processors due to bugs in
|
||||
//the BIOS or the HAL. To specify processor affinity for a thread, use the SetThreadAffinityMask function.
|
||||
HANDLE hThread=GetCurrentThread();
|
||||
SetThreadAffinityMask(hThread,0x1);
|
||||
|
||||
//主板上高精度定时器的晶振频率
|
||||
//这个定时器应该就是一片8253或者8254
|
||||
//在intel ich7中集成了8254
|
||||
QueryPerformanceFrequency(&lFrequency);
|
||||
// if (verbose>0)
|
||||
// printf("高精度定时器的晶振频率:%1.0fHz.\n",(double)lFrequency.QuadPart);
|
||||
|
||||
//这个定时器每经过一个时钟周期,其计数器会+1
|
||||
QueryPerformanceCounter(&lPerformanceCount_Start);
|
||||
|
||||
//RDTSC指令:获取CPU经历的时钟周期数
|
||||
_i64StartCpuCounter=__rdtsc();
|
||||
|
||||
//延时长一点,误差会小一点
|
||||
//int nTemp=100000;
|
||||
//while (--nTemp);
|
||||
Sleep(200);
|
||||
|
||||
QueryPerformanceCounter(&lPerformanceCount_End);
|
||||
|
||||
_i64EndCpuCounter=__rdtsc();
|
||||
|
||||
//f=1/T => f=计数次数/(计数次数*T)
|
||||
//这里的“计数次数*T”就是时间差
|
||||
fTime=((double)lPerformanceCount_End.QuadPart-(double)lPerformanceCount_Start.QuadPart)
|
||||
/(double)lFrequency.QuadPart;
|
||||
|
||||
mhz = (_i64EndCpuCounter-_i64StartCpuCounter)/(fTime*1000000.0);
|
||||
if (verbose>0)
|
||||
printf("CPU频率为:%1.6fMHz.\n",mhz);
|
||||
return mhz;
|
||||
}
|
||||
|
||||
double CPU_Factor1(void)
|
||||
{
|
||||
double result;
|
||||
int i,j,k,ii,jj,kk;
|
||||
LARGE_INTEGER lStart,lEnd;
|
||||
LARGE_INTEGER lFrequency;
|
||||
HANDLE hThread;
|
||||
double fTime;
|
||||
|
||||
QueryPerformanceFrequency(&lFrequency);
|
||||
|
||||
ii = 43273;
|
||||
kk = 1238;
|
||||
result = 1;
|
||||
jj = 1244;
|
||||
|
||||
hThread=GetCurrentThread();
|
||||
SetThreadAffinityMask(hThread,0x1);
|
||||
QueryPerformanceCounter(&lStart);
|
||||
//_asm("cpuid");
|
||||
start_counter();
|
||||
for (i=0;i<100;i++)
|
||||
for (j=0;j<1000;j++)
|
||||
for (k=0;k<1000;k++)
|
||||
kk += kk*ii+jj;
|
||||
|
||||
result = get_counter();
|
||||
QueryPerformanceCounter(&lEnd);
|
||||
fTime=((double)lEnd.QuadPart-(double)lStart.QuadPart);
|
||||
printf("CPU运行时间为%f",result);
|
||||
printf("\t %f\n",fTime);
|
||||
return result;
|
||||
}
|
||||
|
||||
double CPU_Factor(void)
|
||||
{
|
||||
double frequency;
|
||||
double multiplier = 1000 * 1000 * 1000;//nano
|
||||
LARGE_INTEGER lFrequency;
|
||||
LARGE_INTEGER start,stop;
|
||||
HANDLE hThread;
|
||||
int i;
|
||||
const int gigahertz= 1000*1000*1000;
|
||||
const int known_instructions_per_loop = 27317;
|
||||
|
||||
int iterations = 100000000;
|
||||
int g = 0;
|
||||
double normal_ticks_per_second;
|
||||
double ticks;
|
||||
double time;
|
||||
double loops_per_sec;
|
||||
double instructions_per_loop;
|
||||
double ratio;
|
||||
double actual_freq;
|
||||
|
||||
QueryPerformanceFrequency(&lFrequency);
|
||||
frequency = (double)lFrequency.QuadPart;
|
||||
|
||||
hThread=GetCurrentThread();
|
||||
SetThreadAffinityMask(hThread,0x1);
|
||||
QueryPerformanceCounter(&start);
|
||||
for( i = 0; i < iterations; i++)
|
||||
{
|
||||
g++;
|
||||
g++;
|
||||
g++;
|
||||
g++;
|
||||
}
|
||||
QueryPerformanceCounter(&stop);
|
||||
|
||||
//normal ticks differs from the WMI data, i.e 3125, when WMI 3201, and CPUZ 3199
|
||||
normal_ticks_per_second = frequency * 1000;
|
||||
ticks = (double)((double)stop.QuadPart - (double)start.QuadPart);
|
||||
time = (ticks * multiplier) /frequency;
|
||||
loops_per_sec = iterations / (time/multiplier);
|
||||
instructions_per_loop = normal_ticks_per_second / loops_per_sec;
|
||||
|
||||
ratio = (instructions_per_loop / known_instructions_per_loop);
|
||||
actual_freq = normal_ticks_per_second / ratio;
|
||||
/*
|
||||
actual_freq = normal_ticks_per_second / ratio;
|
||||
actual_freq = known_instructions_per_loop*iterations*multiplier/time;
|
||||
|
||||
2293 = x/time;
|
||||
|
||||
2292.599713*1191533038.809362=known_instructions_per_loop*100000000*1000
|
||||
loops_per_sec = iterations*frequency / ticks
|
||||
|
||||
instructions_per_loop = / loops_per_sec;
|
||||
*/
|
||||
printf("Perf counter freq: %f\n", normal_ticks_per_second);
|
||||
printf("Loops per sec: %f\n", loops_per_sec);
|
||||
printf("Perf counter freq div loops per sec: %f\n", instructions_per_loop);
|
||||
printf("Presumed freq: %f\n", actual_freq);
|
||||
printf("ratio: %f\n", ratio);
|
||||
printf("time=%f\n",time);
|
||||
return ratio;
|
||||
}
|
||||
BIN
perflab/matrix/clock.o
Normal file
BIN
perflab/matrix/clock.o
Normal file
Binary file not shown.
BIN
perflab/matrix/cpe.o
Normal file
BIN
perflab/matrix/cpe.o
Normal file
Binary file not shown.
@@ -119,7 +119,7 @@ double fcyc(test_funct f, int *params)
|
||||
if (clear_cache)
|
||||
clear();
|
||||
start_counter();
|
||||
f(params);
|
||||
f((long*)params);
|
||||
cyc = get_counter();
|
||||
if (cyc > 0.0)
|
||||
add_sample(cyc);
|
||||
@@ -131,7 +131,7 @@ double fcyc(test_funct f, int *params)
|
||||
clear();
|
||||
start_counter();
|
||||
for (i=0;i<MAX_ITER_TIMES;i++)
|
||||
f(params);
|
||||
f((long*)params);
|
||||
cyc = get_counter()/MAX_ITER_TIMES;
|
||||
if (cyc > 0.0)
|
||||
add_sample(cyc);
|
||||
|
||||
BIN
perflab/matrix/fcyc.o
Normal file
BIN
perflab/matrix/fcyc.o
Normal file
Binary file not shown.
BIN
perflab/matrix/lsquare.o
Normal file
BIN
perflab/matrix/lsquare.o
Normal file
Binary file not shown.
BIN
perflab/matrix/matrix_test
Normal file
BIN
perflab/matrix/matrix_test
Normal file
Binary file not shown.
@@ -1,77 +1,69 @@
|
||||
/**************************************************************************
|
||||
行/列求和函数。按下面的要求编辑此文件:
|
||||
1. 将你的学号、姓名,以注释的方式写到下面;
|
||||
2. 实现不同版本的行列求和函数;
|
||||
3. 编辑rc_fun_rec rc_fun_tab数组,将你的最好的答案
|
||||
(最好的行和列求和、最好的列求和)作为数组的前两项
|
||||
Row/column sum functions. Edit this file according to the requirements below:
|
||||
1. Write your student ID and name, as a comment, in the space below;
|
||||
2. Implement different versions of the row/column sum functions;
|
||||
3. Edit the rc_fun_rec rc_fun_tab array so that your best answers
|
||||
(best row-and-column sum, best column sum) are the first two entries of the array
|
||||
***************************************************************************/
|
||||
|
||||
|
||||
/*
|
||||
学号:201209054233
|
||||
姓名:夜半加班狂
|
||||
Student ID: 201209054233
|
||||
Name: 夜半加班狂
|
||||
*/
|
||||
|
||||
#include "rowcol.h"
|
||||
#include <math.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include "rowcol.h"
|
||||
#include <math.h>
|
||||
|
||||
/* 参考的列求和函数实现 */
|
||||
/* 计算矩阵中的每一列的和。请注意对于行和列求和来说,调用参数是
|
||||
一样的,只是第2个参数不会用到而已
|
||||
/* Reference implementation of the column sum function */
|
||||
/* Computes the sum of every column of the matrix. Note that the row-and-column
|
||||
sum functions take the same arguments; the 2nd parameter is simply unused here
|
||||
*/
|
||||
|
||||
void c_sum(matrix_t M, vector_t rowsum, vector_t colsum)
|
||||
{
|
||||
int i,j;
|
||||
void c_sum(matrix_t M, vector_t rowsum, vector_t colsum) {
|
||||
int i, j;
|
||||
for (j = 0; j < N; j++) {
|
||||
colsum[j] = 0;
|
||||
for (i = 0; i < N; i++)
|
||||
colsum[j] += M[i][j];
|
||||
}
|
||||
}
|
||||
|
||||
/* Reference implementation of the row-and-column sum function */
|
||||
/* Computes the sum of every row and every column of the matrix. */
|
||||
|
||||
void rc_sum(matrix_t M, vector_t rowsum, vector_t colsum) {
|
||||
int i, j;
|
||||
for (i = 0; i < N; i++) {
|
||||
rowsum[i] = colsum[i] = 0;
|
||||
for (j = 0; j < N; j++) {
|
||||
colsum[j] = 0;
|
||||
for (i = 0; i < N; i++)
|
||||
colsum[j] += M[i][j];
|
||||
rowsum[i] += M[i][j];
|
||||
colsum[i] += M[j][i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* 参考的列和行求和函数实现 */
|
||||
/* 计算矩阵中的每一行、每一列的和。 */
|
||||
|
||||
void rc_sum(matrix_t M, vector_t rowsum, vector_t colsum)
|
||||
{
|
||||
int i,j;
|
||||
for (i = 0; i < N; i++) {
|
||||
rowsum[i] = colsum[i] = 0;
|
||||
for (j = 0; j < N; j++) {
|
||||
rowsum[i] += M[i][j];
|
||||
colsum[i] += M[j][i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*
|
||||
这个表格包含多个数组元素,每一组元素(函数名字, COL/ROWCOL, "描述字符串")
|
||||
COL表示该函数仅仅计算每一列的和
|
||||
ROWCOL表示该函数计算每一行、每一列的和
|
||||
将你认为最好的两个实现,放在最前面。
|
||||
比如:
|
||||
{my_c_sum1, "超级垃圾列求和实现"},
|
||||
{my_rc_sum2, "好一点的行列求和实现"},
|
||||
/*
|
||||
This table holds several array elements, each of the form (function name, COL/ROWCOL, "description string")
|
||||
COL means the function computes only the sum of each column
|
||||
ROWCOL means the function computes the sum of each row and each column
|
||||
Put the two implementations you consider best at the very front.
|
||||
For example:
|
||||
{my_c_sum1, "really poor column sum implementation"},
|
||||
{my_rc_sum2, "slightly better row-and-column sum implementation"},
|
||||
*/
|
||||
|
||||
rc_fun_rec rc_fun_tab[] =
|
||||
{
|
||||
rc_fun_rec rc_fun_tab[] = {
|
||||
|
||||
/* 第一项,应当是你写的最好列求和的函数实现 */
|
||||
/* First entry: should be your best column sum implementation */
|
||||
{c_sum, COL, "Best column sum"},
|
||||
/* 第二项,应当是你写的最好行列求和的函数实现 */
|
||||
/* Second entry: should be your best row-and-column sum implementation */
|
||||
{rc_sum, ROWCOL, "Best row and column sum"},
|
||||
|
||||
{c_sum, COL, "Column sum, reference implementation"},
|
||||
|
||||
{rc_sum, ROWCOL, "Row and column sum, reference implementation"},
|
||||
|
||||
/* 下面的代码不能修改或者删除!!表明数组列表结束 */
|
||||
{NULL,ROWCOL,NULL}
|
||||
};
|
||||
/* The code below must not be modified or removed!! It marks the end of the array */
|
||||
{NULL, ROWCOL, NULL}};
|
||||
162
perflab/matrix/rowcol.c~
Normal file
162
perflab/matrix/rowcol.c~
Normal file
@@ -0,0 +1,162 @@
|
||||
/**************************************************************************
|
||||
行/列求和函数。按下面的要求编辑此文件:
|
||||
1. 将你的学号、姓名,以注释的方式写到下面;
|
||||
2. 实现不同版本的行列求和函数;
|
||||
3. 编辑rc_fun_rec rc_fun_tab数组,将你的最好的答案
|
||||
(最好的行和列求和、最好的列求和)作为数组的前两项
|
||||
***************************************************************************/
|
||||
|
||||
/*
|
||||
学号:202302723005
|
||||
姓名:程景愉
|
||||
*/
|
||||
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include "rowcol.h"
|
||||
#include <math.h>
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
/* 参考的列求和函数实现 */
|
||||
/* 计算矩阵中的每一列的和。请注意对于行和列求和来说,调用参数是
|
||||
一样的,只是第2个参数不会用到而已
|
||||
*/
|
||||
|
||||
void c_sum(matrix_t M, vector_t rowsum, vector_t colsum)
|
||||
{
|
||||
int i,j;
|
||||
for (j = 0; j < N; j++) {
|
||||
colsum[j] = 0;
|
||||
for (i = 0; i < N; i++)
|
||||
colsum[j] += M[i][j];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* 参考的列和行求和函数实现 */
|
||||
/* 计算矩阵中的每一行、每一列的和。 */
|
||||
|
||||
void rc_sum(matrix_t M, vector_t rowsum, vector_t colsum)
|
||||
{
|
||||
int i,j;
|
||||
for (i = 0; i < N; i++) {
|
||||
rowsum[i] = colsum[i] = 0;
|
||||
for (j = 0; j < N; j++) {
|
||||
rowsum[i] += M[i][j];
|
||||
colsum[i] += M[j][i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* CUDA优化的列求和函数 */
|
||||
void cuda_c_sum(matrix_t M, vector_t rowsum, vector_t colsum)
|
||||
{
|
||||
// 分配设备内存
|
||||
int *d_M, *d_colsum;
|
||||
cudaMalloc(&d_M, N * N * sizeof(int));
|
||||
cudaMalloc(&d_colsum, N * sizeof(int));
|
||||
|
||||
// 将数据从主机复制到设备
|
||||
cudaMemcpy(d_M, M, N * N * sizeof(int), cudaMemcpyHostToDevice);
|
||||
|
||||
// 定义CUDA核函数
|
||||
dim3 blockDim(256);
|
||||
dim3 gridDim((N + blockDim.x - 1) / blockDim.x);
|
||||
|
||||
// 启动核函数
|
||||
cudaColumnSum<<<gridDim, blockDim>>>(d_M, d_colsum);
|
||||
|
||||
// 将结果从设备复制回主机
|
||||
cudaMemcpy(colsum, d_colsum, N * sizeof(int), cudaMemcpyDeviceToHost);
|
||||
|
||||
// 释放设备内存
|
||||
cudaFree(d_M);
|
||||
cudaFree(d_colsum);
|
||||
}
|
||||
|
||||
/* CUDA优化的行列求和函数 */
|
||||
void cuda_rc_sum(matrix_t M, vector_t rowsum, vector_t colsum)
|
||||
{
|
||||
// 分配设备内存
|
||||
int *d_M, *d_rowsum, *d_colsum;
|
||||
cudaMalloc(&d_M, N * N * sizeof(int));
|
||||
cudaMalloc(&d_rowsum, N * sizeof(int));
|
||||
cudaMalloc(&d_colsum, N * sizeof(int));
|
||||
|
||||
// 将数据从主机复制到设备
|
||||
cudaMemcpy(d_M, M, N * N * sizeof(int), cudaMemcpyHostToDevice);
|
||||
|
||||
// 定义CUDA核函数
|
||||
dim3 blockDim(256);
|
||||
dim3 gridDim((N + blockDim.x - 1) / blockDim.x);
|
||||
|
||||
// 启动核函数
|
||||
cudaRowColSum<<<gridDim, blockDim>>>(d_M, d_rowsum, d_colsum);
|
||||
|
||||
// 将结果从设备复制回主机
|
||||
cudaMemcpy(rowsum, d_rowsum, N * sizeof(int), cudaMemcpyDeviceToHost);
|
||||
cudaMemcpy(colsum, d_colsum, N * sizeof(int), cudaMemcpyDeviceToHost);
|
||||
|
||||
// 释放设备内存
|
||||
cudaFree(d_M);
|
||||
cudaFree(d_rowsum);
|
||||
cudaFree(d_colsum);
|
||||
}
|
||||
|
||||
/* CUDA核函数 - 列求和 */
|
||||
__global__ void cudaColumnSum(int *M, int *colsum)
|
||||
{
|
||||
int col = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if (col < N) {
|
||||
colsum[col] = 0;
|
||||
for (int row = 0; row < N; row++) {
|
||||
colsum[col] += M[row * N + col];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* CUDA核函数 - 行列求和 */
|
||||
__global__ void cudaRowColSum(int *M, int *rowsum, int *colsum)
|
||||
{
|
||||
int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if (idx < N) {
|
||||
// 计算行和
|
||||
rowsum[idx] = 0;
|
||||
for (int j = 0; j < N; j++) {
|
||||
rowsum[idx] += M[idx * N + j];
|
||||
}
|
||||
|
||||
// 计算列和
|
||||
colsum[idx] = 0;
|
||||
for (int i = 0; i < N; i++) {
|
||||
colsum[idx] += M[i * N + idx];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
这个表格包含多个数组元素,每一组元素(函数名字, COL/ROWCOL, "描述字符串")
|
||||
COL表示该函数仅仅计算每一列的和
|
||||
ROWCOL表示该函数计算每一行、每一列的和
|
||||
将你认为最好的两个实现,放在最前面。
|
||||
比如:
|
||||
{my_c_sum1, "超级垃圾列求和实现"},
|
||||
{my_rc_sum2, "好一点的行列求和实现"},
|
||||
*/
|
||||
|
||||
rc_fun_rec rc_fun_tab[] =
|
||||
{
|
||||
|
||||
/* 第一项,应当是你写的最好列求和的函数实现 */
|
||||
{cuda_c_sum, COL, "CUDA optimized column sum"},
|
||||
/* 第二项,应当是你写的最好行列求和的函数实现 */
|
||||
{cuda_rc_sum, ROWCOL, "CUDA optimized row and column sum"},
|
||||
|
||||
{c_sum, COL, "Column sum, reference implementation"},
|
||||
|
||||
{rc_sum, ROWCOL, "Row and column sum, reference implementation"},
|
||||
|
||||
/* 下面的代码不能修改或者删除!!表明数组列表结束 */
|
||||
{NULL,ROWCOL,NULL}
|
||||
};
|
||||
BIN
perflab/matrix/rowcol.o
Normal file
BIN
perflab/matrix/rowcol.o
Normal file
Binary file not shown.
240
perflab/matrix/rowcol.y~
Normal file
240
perflab/matrix/rowcol.y~
Normal file
@@ -0,0 +1,240 @@
|
||||
/**************************************************************************
|
||||
靠/靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠<E99DA0>
|
||||
1. 靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠<E99DA0>
|
||||
2. 靠靠靠靠靠靠靠靠靠靠靠
|
||||
3. 靠rc_fun_rec rc_fun_tab靠靠靠靠靠靠旷靠靠
|
||||
靠靠旷靠靠靠靠靠靠靠蹩靠靠靠靠靠靠靠靠靠靠<E99DA0>
|
||||
***************************************************************************/
|
||||
|
||||
/*
|
||||
靠靠201209054233
|
||||
靠靠靠靠靠靠靠
|
||||
*/
|
||||
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include "rowcol.h"
|
||||
#include <math.h>
|
||||
|
||||
/* 靠靠靠靠靠靠靠靠<E99DA0> */
|
||||
/* 靠靠靠靠靠<E99DA0><E99DA0>靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠蚩靠靠
|
||||
靠靠靠靠靠2靠靠靠靠靠旷靠靠<E99DA0>
|
||||
*/
|
||||
|
||||
void c_sum(matrix_t M, vector_t rowsum, vector_t colsum)
|
||||
{
|
||||
int i,j;
|
||||
for (j = 0; j < N; j++) {
|
||||
colsum[j] = 0;
|
||||
for (i = 0; i < N; i++)
|
||||
colsum[j] += M[i][j];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* 靠靠靠靠靠靠靠靠靠靠 */
|
||||
/* 靠靠靠靠靠<E99DA0><E99DA0>靠靠<E99DA0>靠靠靠 */
|
||||
|
||||
void rc_sum(matrix_t M, vector_t rowsum, vector_t colsum)
|
||||
{
|
||||
int i,j;
|
||||
for (i = 0; i < N; i++) {
|
||||
rowsum[i] = colsum[i] = 0;
|
||||
for (j = 0; j < N; j++) {
|
||||
rowsum[i] += M[i][j];
|
||||
colsum[i] += M[j][i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*
|
||||
靠靠靠靠靠靠靠靠靠靠靠靠<E99DA0>靠靠靠靠靠靠靠<E99DA0>, COL/ROWCOL, "靠靠靠靠<E99DA0>"靠
|
||||
COL靠靠<E99DA0>靠靠靠靠靠<E99DA0>靠靠<E99DA0>
|
||||
ROWCOL靠靠<E99DA0>靠靠靠<E99DA0>靠靠<E99DA0>靠靠<E99DA0>
|
||||
靠靠靠靠靠蹩靠靠靠靠靠靠靠靠<E99DA0>
|
||||
靠靠
|
||||
{my_c_sum1, "靠靠靠靠靠靠靠靠<E99DA0>"},
|
||||
{my_rc_sum2, "靠靠靠靠靠靠靠靠靠"},
|
||||
*/
|
||||
|
||||
rc_fun_rec rc_fun_tab[] =
|
||||
{
|
||||
|
||||
/* 靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠 */
|
||||
{c_sum, COL, "Best column sum"},
|
||||
/* 靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠 */
|
||||
{rc_sum, ROWCOL, "Best row and column sum"},
|
||||
|
||||
{c_sum, COL, "Column sum, reference implementation"},
|
||||
|
||||
{rc_sum, ROWCOL, "Row and column sum, reference implementation"},
|
||||
|
||||
/* 靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠<E99DA0> */
|
||||
{NULL,ROWCOL,NULL}
|
||||
};
|
||||
|
||||
// /**************************************************************************
|
||||
// 行/列求和函数。按下面的要求编辑此文件:
|
||||
// 1. 将你的学号、姓名,以注释的方式写到下面;
|
||||
// 2. 实现不同版本的行列求和函数;
|
||||
// 3. 编辑rc_fun_rec rc_fun_tab数组,将你的最好的答案
|
||||
// (最好的行和列求和、最好的列求和)作为数组的前两项
|
||||
// ***************************************************************************/
|
||||
//
|
||||
// /*
|
||||
// 学号:202302723005
|
||||
// 姓名:程景愉
|
||||
// */
|
||||
//
|
||||
//
|
||||
// #include <stdio.h>
|
||||
// #include <stdlib.h>
|
||||
// #include "rowcol.h"
|
||||
// #include <math.h>
|
||||
// #include <cuda_runtime.h>
|
||||
//
|
||||
// /* 参考的列求和函数实现 */
|
||||
// /* 计算矩阵中的每一列的和。请注意对于行和列求和来说,调用参数是
|
||||
// 一样的,只是第2个参数不会用到而已
|
||||
// */
|
||||
//
|
||||
// void c_sum(matrix_t M, vector_t rowsum, vector_t colsum)
|
||||
// {
|
||||
// int i,j;
|
||||
// for (j = 0; j < N; j++) {
|
||||
// colsum[j] = 0;
|
||||
// for (i = 0; i < N; i++)
|
||||
// colsum[j] += M[i][j];
|
||||
// }
|
||||
// }
|
||||
//
|
||||
//
|
||||
// /* 参考的列和行求和函数实现 */
|
||||
// /* 计算矩阵中的每一行、每一列的和。 */
|
||||
//
|
||||
// void rc_sum(matrix_t M, vector_t rowsum, vector_t colsum)
|
||||
// {
|
||||
// int i,j;
|
||||
// for (i = 0; i < N; i++) {
|
||||
// rowsum[i] = colsum[i] = 0;
|
||||
// for (j = 0; j < N; j++) {
|
||||
// rowsum[i] += M[i][j];
|
||||
// colsum[i] += M[j][i];
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// /* CUDA优化的列求和函数 */
|
||||
// void cuda_c_sum(matrix_t M, vector_t rowsum, vector_t colsum)
|
||||
// {
|
||||
// // 分配设备内存
|
||||
// int *d_M, *d_colsum;
|
||||
// cudaMalloc(&d_M, N * N * sizeof(int));
|
||||
// cudaMalloc(&d_colsum, N * sizeof(int));
|
||||
//
|
||||
// // 将数据从主机复制到设备
|
||||
// cudaMemcpy(d_M, M, N * N * sizeof(int), cudaMemcpyHostToDevice);
|
||||
//
|
||||
// // 定义CUDA核函数
|
||||
// dim3 blockDim(256);
|
||||
// dim3 gridDim((N + blockDim.x - 1) / blockDim.x);
|
||||
//
|
||||
// // 启动核函数
|
||||
// cudaColumnSum<<<gridDim, blockDim>>>(d_M, d_colsum);
|
||||
//
|
||||
// // 将结果从设备复制回主机
|
||||
// cudaMemcpy(colsum, d_colsum, N * sizeof(int), cudaMemcpyDeviceToHost);
|
||||
//
|
||||
// // 释放设备内存
|
||||
// cudaFree(d_M);
|
||||
// cudaFree(d_colsum);
|
||||
// }
|
||||
//
|
||||
// /* CUDA优化的行列求和函数 */
|
||||
// void cuda_rc_sum(matrix_t M, vector_t rowsum, vector_t colsum)
|
||||
// {
|
||||
// // 分配设备内存
|
||||
// int *d_M, *d_rowsum, *d_colsum;
|
||||
// cudaMalloc(&d_M, N * N * sizeof(int));
|
||||
// cudaMalloc(&d_rowsum, N * sizeof(int));
|
||||
// cudaMalloc(&d_colsum, N * sizeof(int));
|
||||
//
|
||||
// // 将数据从主机复制到设备
|
||||
// cudaMemcpy(d_M, M, N * N * sizeof(int), cudaMemcpyHostToDevice);
|
||||
//
|
||||
// // 定义CUDA核函数
|
||||
// dim3 blockDim(256);
|
||||
// dim3 gridDim((N + blockDim.x - 1) / blockDim.x);
|
||||
//
|
||||
// // 启动核函数
|
||||
// cudaRowColSum<<<gridDim, blockDim>>>(d_M, d_rowsum, d_colsum);
|
||||
//
|
||||
// // 将结果从设备复制回主机
|
||||
// cudaMemcpy(rowsum, d_rowsum, N * sizeof(int), cudaMemcpyDeviceToHost);
|
||||
// cudaMemcpy(colsum, d_colsum, N * sizeof(int), cudaMemcpyDeviceToHost);
|
||||
//
|
||||
// // 释放设备内存
|
||||
// cudaFree(d_M);
|
||||
// cudaFree(d_rowsum);
|
||||
// cudaFree(d_colsum);
|
||||
// }
|
||||
//
|
||||
// /* CUDA核函数 - 列求和 */
|
||||
// __global__ void cudaColumnSum(int *M, int *colsum)
|
||||
// {
|
||||
// int col = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
// if (col < N) {
|
||||
// colsum[col] = 0;
|
||||
// for (int row = 0; row < N; row++) {
|
||||
// colsum[col] += M[row * N + col];
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// /* CUDA核函数 - 行列求和 */
|
||||
// __global__ void cudaRowColSum(int *M, int *rowsum, int *colsum)
|
||||
// {
|
||||
// int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
// if (idx < N) {
|
||||
// // 计算行和
|
||||
// rowsum[idx] = 0;
|
||||
// for (int j = 0; j < N; j++) {
|
||||
// rowsum[idx] += M[idx * N + j];
|
||||
// }
|
||||
//
|
||||
// // 计算列和
|
||||
// colsum[idx] = 0;
|
||||
// for (int i = 0; i < N; i++) {
|
||||
// colsum[idx] += M[i * N + idx];
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// /*
|
||||
// 这个表格包含多个数组元素,每一组元素(函数名字, COL/ROWCOL, "描述字符串")
|
||||
// COL表示该函数仅仅计算每一列的和
|
||||
// ROWCOL表示该函数计算每一行、每一列的和
|
||||
// 将你认为最好的两个实现,放在最前面。
|
||||
// 比如:
|
||||
// {my_c_sum1, "超级垃圾列求和实现"},
|
||||
// {my_rc_sum2, "好一点的行列求和实现"},
|
||||
// */
|
||||
//
|
||||
// rc_fun_rec rc_fun_tab[] =
|
||||
// {
|
||||
//
|
||||
// /* 第一项,应当是你写的最好列求和的函数实现 */
|
||||
// {cuda_c_sum, COL, "CUDA optimized column sum"},
|
||||
// /* 第二项,应当是你写的最好行列求和的函数实现 */
|
||||
// {cuda_rc_sum, ROWCOL, "CUDA optimized row and column sum"},
|
||||
//
|
||||
// {c_sum, COL, "Column sum, reference implementation"},
|
||||
//
|
||||
// {rc_sum, ROWCOL, "Row and column sum, reference implementation"},
|
||||
//
|
||||
// /* 下面的代码不能修改或者删除!!表明数组列表结束 */
|
||||
// {NULL,ROWCOL,NULL}
|
||||
// };
|
||||
240
perflab/matrix/rowcol.z~
Normal file
240
perflab/matrix/rowcol.z~
Normal file
@@ -0,0 +1,240 @@
|
||||
/**************************************************************************
    Row/column sum functions. Edit this file as follows:
    1. Write your student ID and name below, as a comment;
    2. Implement different versions of the row/column sum functions;
    3. Edit the rc_fun_rec rc_fun_tab array so that your best answers
       (best row-and-column sum, best column sum) are its first two entries.
***************************************************************************/
|
||||
|
||||
/*
|
||||
靠靠201209054233
|
||||
靠靠靠靠靠靠靠
|
||||
*/
|
||||
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include "rowcol.h"
|
||||
#include <math.h>
|
||||
|
||||
/* 靠靠靠靠靠靠靠靠<E99DA0> */
|
||||
/* 靠靠靠靠靠<E99DA0><E99DA0>靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠蚩靠靠
|
||||
靠靠靠靠靠2靠靠靠靠靠旷靠靠<E99DA0>
|
||||
*/
|
||||
|
||||
void c_sum(matrix_t M, vector_t rowsum, vector_t colsum)
|
||||
{
|
||||
int i,j;
|
||||
for (j = 0; j < N; j++) {
|
||||
colsum[j] = 0;
|
||||
for (i = 0; i < N; i++)
|
||||
colsum[j] += M[i][j];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* 靠靠靠靠靠靠靠靠靠靠 */
|
||||
/* 靠靠靠靠靠<E99DA0><E99DA0>靠靠<E99DA0>靠靠靠 */
|
||||
|
||||
void rc_sum(matrix_t M, vector_t rowsum, vector_t colsum)
|
||||
{
|
||||
int i,j;
|
||||
for (i = 0; i < N; i++) {
|
||||
rowsum[i] = colsum[i] = 0;
|
||||
for (j = 0; j < N; j++) {
|
||||
rowsum[i] += M[i][j];
|
||||
colsum[i] += M[j][i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*
|
||||
靠靠靠靠靠靠靠靠靠靠靠靠<E99DA0>靠靠靠靠靠靠靠<E99DA0>, COL/ROWCOL, "靠靠靠靠<E99DA0>"靠
|
||||
COL靠靠<E99DA0>靠靠靠靠靠<E99DA0>靠靠<E99DA0>
|
||||
ROWCOL靠靠<E99DA0>靠靠靠<E99DA0>靠靠<E99DA0>靠靠<E99DA0>
|
||||
靠靠靠靠靠蹩靠靠靠靠靠靠靠靠<E99DA0>
|
||||
靠靠
|
||||
{my_c_sum1, "靠靠靠靠靠靠靠靠<E99DA0>"},
|
||||
{my_rc_sum2, "靠靠靠靠靠靠靠靠靠"},
|
||||
*/
|
||||
|
||||
rc_fun_rec rc_fun_tab[] =
|
||||
{
|
||||
|
||||
/* 靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠 */
|
||||
{c_sum, COL, "Best column sum"},
|
||||
/* 靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠 */
|
||||
{rc_sum, ROWCOL, "Best row and column sum"},
|
||||
|
||||
{c_sum, COL, "Column sum, reference implementation"},
|
||||
|
||||
{rc_sum, ROWCOL, "Row and column sum, reference implementation"},
|
||||
|
||||
/* 靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠<E99DA0> */
|
||||
{NULL,ROWCOL,NULL}
|
||||
};
|
||||
|
||||
// /**************************************************************************
|
||||
// 行/列求和函数。按下面的要求编辑此文件:
|
||||
// 1. 将你的学号、姓名,以注释的方式写到下面;
|
||||
// 2. 实现不同版本的行列求和函数;
|
||||
// 3. 编辑rc_fun_rec rc_fun_tab数组,将你的最好的答案
|
||||
// (最好的行和列求和、最好的列求和)作为数组的前两项
|
||||
// ***************************************************************************/
|
||||
//
|
||||
// /*
|
||||
// 学号:202302723005
|
||||
// 姓名:程景愉
|
||||
// */
|
||||
//
|
||||
//
|
||||
// #include <stdio.h>
|
||||
// #include <stdlib.h>
|
||||
// #include "rowcol.h"
|
||||
// #include <math.h>
|
||||
// #include <cuda_runtime.h>
|
||||
//
|
||||
// /* 参考的列求和函数实现 */
|
||||
// /* 计算矩阵中的每一列的和。请注意对于行和列求和来说,调用参数是
|
||||
// 一样的,只是第2个参数不会用到而已
|
||||
// */
|
||||
//
|
||||
// void c_sum(matrix_t M, vector_t rowsum, vector_t colsum)
|
||||
// {
|
||||
// int i,j;
|
||||
// for (j = 0; j < N; j++) {
|
||||
// colsum[j] = 0;
|
||||
// for (i = 0; i < N; i++)
|
||||
// colsum[j] += M[i][j];
|
||||
// }
|
||||
// }
|
||||
//
|
||||
//
|
||||
// /* 参考的列和行求和函数实现 */
|
||||
// /* 计算矩阵中的每一行、每一列的和。 */
|
||||
//
|
||||
// void rc_sum(matrix_t M, vector_t rowsum, vector_t colsum)
|
||||
// {
|
||||
// int i,j;
|
||||
// for (i = 0; i < N; i++) {
|
||||
// rowsum[i] = colsum[i] = 0;
|
||||
// for (j = 0; j < N; j++) {
|
||||
// rowsum[i] += M[i][j];
|
||||
// colsum[i] += M[j][i];
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// /* CUDA优化的列求和函数 */
|
||||
// void cuda_c_sum(matrix_t M, vector_t rowsum, vector_t colsum)
|
||||
// {
|
||||
// // 分配设备内存
|
||||
// int *d_M, *d_colsum;
|
||||
// cudaMalloc(&d_M, N * N * sizeof(int));
|
||||
// cudaMalloc(&d_colsum, N * sizeof(int));
|
||||
//
|
||||
// // 将数据从主机复制到设备
|
||||
// cudaMemcpy(d_M, M, N * N * sizeof(int), cudaMemcpyHostToDevice);
|
||||
//
|
||||
// // 定义CUDA核函数
|
||||
// dim3 blockDim(256);
|
||||
// dim3 gridDim((N + blockDim.x - 1) / blockDim.x);
|
||||
//
|
||||
// // 启动核函数
|
||||
// cudaColumnSum<<<gridDim, blockDim>>>(d_M, d_colsum);
|
||||
//
|
||||
// // 将结果从设备复制回主机
|
||||
// cudaMemcpy(colsum, d_colsum, N * sizeof(int), cudaMemcpyDeviceToHost);
|
||||
//
|
||||
// // 释放设备内存
|
||||
// cudaFree(d_M);
|
||||
// cudaFree(d_colsum);
|
||||
// }
|
||||
//
|
||||
// /* CUDA优化的行列求和函数 */
|
||||
// void cuda_rc_sum(matrix_t M, vector_t rowsum, vector_t colsum)
|
||||
// {
|
||||
// // 分配设备内存
|
||||
// int *d_M, *d_rowsum, *d_colsum;
|
||||
// cudaMalloc(&d_M, N * N * sizeof(int));
|
||||
// cudaMalloc(&d_rowsum, N * sizeof(int));
|
||||
// cudaMalloc(&d_colsum, N * sizeof(int));
|
||||
//
|
||||
// // 将数据从主机复制到设备
|
||||
// cudaMemcpy(d_M, M, N * N * sizeof(int), cudaMemcpyHostToDevice);
|
||||
//
|
||||
// // 定义CUDA核函数
|
||||
// dim3 blockDim(256);
|
||||
// dim3 gridDim((N + blockDim.x - 1) / blockDim.x);
|
||||
//
|
||||
// // 启动核函数
|
||||
// cudaRowColSum<<<gridDim, blockDim>>>(d_M, d_rowsum, d_colsum);
|
||||
//
|
||||
// // 将结果从设备复制回主机
|
||||
// cudaMemcpy(rowsum, d_rowsum, N * sizeof(int), cudaMemcpyDeviceToHost);
|
||||
// cudaMemcpy(colsum, d_colsum, N * sizeof(int), cudaMemcpyDeviceToHost);
|
||||
//
|
||||
// // 释放设备内存
|
||||
// cudaFree(d_M);
|
||||
// cudaFree(d_rowsum);
|
||||
// cudaFree(d_colsum);
|
||||
// }
|
||||
//
|
||||
// /* CUDA核函数 - 列求和 */
|
||||
// __global__ void cudaColumnSum(int *M, int *colsum)
|
||||
// {
|
||||
// int col = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
// if (col < N) {
|
||||
// colsum[col] = 0;
|
||||
// for (int row = 0; row < N; row++) {
|
||||
// colsum[col] += M[row * N + col];
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// /* CUDA核函数 - 行列求和 */
|
||||
// __global__ void cudaRowColSum(int *M, int *rowsum, int *colsum)
|
||||
// {
|
||||
// int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
// if (idx < N) {
|
||||
// // 计算行和
|
||||
// rowsum[idx] = 0;
|
||||
// for (int j = 0; j < N; j++) {
|
||||
// rowsum[idx] += M[idx * N + j];
|
||||
// }
|
||||
//
|
||||
// // 计算列和
|
||||
// colsum[idx] = 0;
|
||||
// for (int i = 0; i < N; i++) {
|
||||
// colsum[idx] += M[i * N + idx];
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// /*
|
||||
// 这个表格包含多个数组元素,每一组元素(函数名字, COL/ROWCOL, "描述字符串")
|
||||
// COL表示该函数仅仅计算每一列的和
|
||||
// ROWCOL表示该函数计算每一行、每一列的和
|
||||
// 将你认为最好的两个实现,放在最前面。
|
||||
// 比如:
|
||||
// {my_c_sum1, "超级垃圾列求和实现"},
|
||||
// {my_rc_sum2, "好一点的行列求和实现"},
|
||||
// */
|
||||
//
|
||||
// rc_fun_rec rc_fun_tab[] =
|
||||
// {
|
||||
//
|
||||
// /* 第一项,应当是你写的最好列求和的函数实现 */
|
||||
// {cuda_c_sum, COL, "CUDA optimized column sum"},
|
||||
// /* 第二项,应当是你写的最好行列求和的函数实现 */
|
||||
// {cuda_rc_sum, ROWCOL, "CUDA optimized row and column sum"},
|
||||
//
|
||||
// {c_sum, COL, "Column sum, reference implementation"},
|
||||
//
|
||||
// {rc_sum, ROWCOL, "Row and column sum, reference implementation"},
|
||||
//
|
||||
// /* 下面的代码不能修改或者删除!!表明数组列表结束 */
|
||||
// {NULL,ROWCOL,NULL}
|
||||
// };
|
||||
69
perflab/matrix/rowcol_202302723005.c
Normal file
69
perflab/matrix/rowcol_202302723005.c
Normal file
@@ -0,0 +1,69 @@
|
||||
/**************************************************************************
    Row/column sum functions. Edit this file as follows:
    1. Write your student ID and name below, as a comment;
    2. Implement different versions of the row/column sum functions;
    3. Edit the rc_fun_rec rc_fun_tab array so that your best answers
       (best row-and-column sum, best column sum) are its first two entries.
***************************************************************************/
|
||||
|
||||
/*
|
||||
????201209054233
|
||||
??????????????
|
||||
*/
|
||||
|
||||
#include "rowcol.h"
|
||||
#include <math.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
/* ????????????????? */
|
||||
/* ???????????????????????????????????????????????
|
||||
??????????2?????????????????
|
||||
*/
|
||||
|
||||
void c_sum(matrix_t M, vector_t rowsum, vector_t colsum) {
|
||||
int i, j;
|
||||
for (j = 0; j < N; j++) {
|
||||
colsum[j] = 0;
|
||||
for (i = 0; i < N; i++)
|
||||
colsum[j] += M[i][j];
|
||||
}
|
||||
}
|
||||
|
||||
/* ???????????????????? */
|
||||
/* ??????????????????????? */
|
||||
|
||||
void rc_sum(matrix_t M, vector_t rowsum, vector_t colsum) {
|
||||
int i, j;
|
||||
for (i = 0; i < N; i++) {
|
||||
rowsum[i] = colsum[i] = 0;
|
||||
for (j = 0; j < N; j++) {
|
||||
rowsum[i] += M[i][j];
|
||||
colsum[i] += M[j][i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
????????????????????????????????????????, COL/ROWCOL, "?????????"??
|
||||
COL??????????????????????
|
||||
ROWCOL???????????????????????
|
||||
?????????????????????????????
|
||||
????
|
||||
{my_c_sum1, "?????????????????"},
|
||||
{my_rc_sum2, "??????????????????"},
|
||||
*/
|
||||
|
||||
rc_fun_rec rc_fun_tab[] = {
|
||||
|
||||
/* ???????????????????????????????? */
|
||||
{c_sum, COL, "Best column sum"},
|
||||
/* ?????????????????????????????????? */
|
||||
{rc_sum, ROWCOL, "Best row and column sum"},
|
||||
|
||||
{c_sum, COL, "Column sum, reference implementation"},
|
||||
|
||||
{rc_sum, ROWCOL, "Row and column sum, reference implementation"},
|
||||
|
||||
/* ??????????????????????????????????????? */
|
||||
{NULL, ROWCOL, NULL}};
|
||||
BIN
perflab/matrix/rowcol_202302723005.o
Normal file
BIN
perflab/matrix/rowcol_202302723005.o
Normal file
Binary file not shown.
@@ -1,9 +1,9 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
//#include <random.h>
|
||||
#include "rowcol.h"
|
||||
#include "fcyc.h"
|
||||
// #include <random.h>
|
||||
#include "clock.h"
|
||||
#include "fcyc.h"
|
||||
#include "rowcol.h"
|
||||
|
||||
#define MAX_ITER_COUNT 100
|
||||
|
||||
@@ -11,9 +11,9 @@
|
||||
/* Timing standards that measured results are scored against:
   index 0 is the column-sum task, index 1 the row-and-column-sum task. */
static struct {
  double cref;  /* Cycles taken by reference solution */
  double cbest; /* Cycles taken by our best implementation */
} cstandard[2] = {
    {7.7, 6.40}, /* Column Sum */
    {9.75, 6.60} /* Row & Column Sum */
};
|
||||
|
||||
/* Put in code to align matrix so that it starts on a cache block boundary.
|
||||
@@ -26,7 +26,7 @@ static struct {
|
||||
#define WPB 16
|
||||
|
||||
int verbose = 1;
|
||||
/* Backing storage for the N x N test matrix, plus WPB spare ints so that
   init_tests() can round the start of the matrix (mstart) up to a block
   boundary without running off the end. */
int data[N * N + WPB];
|
||||
int *mstart;
|
||||
|
||||
typedef vector_t *row_t;
|
||||
@@ -37,137 +37,122 @@ vector_t rsref, csref, rcomp, ccomp;
|
||||
static void init_tests(void);
|
||||
extern void make_CPU_busy(void);
|
||||
|
||||
static void init_tests(void)
|
||||
{
|
||||
int i, j;
|
||||
size_t bytes_per_block = sizeof(int) * WPB;
|
||||
/* round mstart up to nearest block boundary */
|
||||
mstart = (int *)
|
||||
(((size_t) data + bytes_per_block-1) / bytes_per_block * bytes_per_block);
|
||||
for (i = 0; i < N; i++) {
|
||||
rsref[i] = csref[i] = 0;
|
||||
}
|
||||
for (i = 0; i < N; i++) {
|
||||
for (j = 0; j < N; j++) {
|
||||
int val = rand();
|
||||
mstart[i*N+j] = val;
|
||||
rsref[i] += val;
|
||||
csref[j] += val;
|
||||
}
|
||||
static void init_tests(void) {
|
||||
int i, j;
|
||||
size_t bytes_per_block = sizeof(int) * WPB;
|
||||
/* round mstart up to nearest block boundary */
|
||||
mstart = (int *)(((size_t)data + bytes_per_block - 1) / bytes_per_block *
|
||||
bytes_per_block);
|
||||
for (i = 0; i < N; i++) {
|
||||
rsref[i] = csref[i] = 0;
|
||||
}
|
||||
for (i = 0; i < N; i++) {
|
||||
for (j = 0; j < N; j++) {
|
||||
int val = rand();
|
||||
mstart[i * N + j] = val;
|
||||
rsref[i] += val;
|
||||
csref[j] += val;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* Test function on all values */
|
||||
int test_rc(rc_fun f, FILE *rpt, rc_comp_t rc_type) {
|
||||
int i;
|
||||
int ok = 1;
|
||||
int i;
|
||||
int ok = 1;
|
||||
|
||||
for (i = 0; i < N; i++)
|
||||
rcomp[i] = ccomp[i] = 0xDEADBEEF;
|
||||
f((row_t)mstart, rcomp, ccomp);
|
||||
|
||||
for (i = 0; ok && i < N; i++) {
|
||||
if (rc_type == ROWCOL
|
||||
&& rsref[i] != rcomp[i]) {
|
||||
ok = 0;
|
||||
if (rpt)
|
||||
fprintf(rpt,
|
||||
"对第%d行的计算出错!正确结果是%d,但是计算得到%d\n",
|
||||
i, rsref[i], rcomp[i]);
|
||||
}
|
||||
if ((rc_type == ROWCOL || rc_type == COL)
|
||||
&& csref[i] != ccomp[i]) {
|
||||
ok = 0;
|
||||
if (rpt)
|
||||
fprintf(rpt,
|
||||
"对第%d列的计算出错!正确结果是%d,但是计算得到%d\n",
|
||||
i, csref[i], ccomp[i]);
|
||||
}
|
||||
for (i = 0; i < N; i++)
|
||||
rcomp[i] = ccomp[i] = 0xDEADBEEF;
|
||||
f((row_t)mstart, rcomp, ccomp);
|
||||
|
||||
for (i = 0; ok && i < N; i++) {
|
||||
if (rc_type == ROWCOL && rsref[i] != rcomp[i]) {
|
||||
ok = 0;
|
||||
if (rpt)
|
||||
fprintf(rpt, "对第%d行的计算出错!正确结果是%d,但是计算得到%d\n", i,
|
||||
rsref[i], rcomp[i]);
|
||||
}
|
||||
return ok;
|
||||
if ((rc_type == ROWCOL || rc_type == COL) && csref[i] != ccomp[i]) {
|
||||
ok = 0;
|
||||
if (rpt)
|
||||
fprintf(rpt, "对第%d列的计算出错!正确结果是%d,但是计算得到%d\n", i,
|
||||
csref[i], ccomp[i]);
|
||||
}
|
||||
}
|
||||
return ok;
|
||||
}
|
||||
|
||||
/* Kludgy way to interface to cycle measuring code */
|
||||
void do_test(int *intf)
|
||||
{
|
||||
rc_fun f = (rc_fun) intf;
|
||||
void do_test(int *intf) {
|
||||
rc_fun f = (rc_fun)intf;
|
||||
f((row_t)mstart, rcomp, ccomp);
|
||||
}
|
||||
|
||||
/*
 * Check f for correctness and, if it passes, measure it.
 *
 *   f        function under test
 *   rc_type  COL or ROWCOL, forwarded to test_rc()
 *   descr    description printed when verbose is set
 *   cycp     if non-NULL, receives the measured cycles per matrix element;
 *            left untouched when f fails the correctness check
 */
void time_rc(rc_fun f, rc_comp_t rc_type, char *descr, double *cycp) {
  int i;
  int *intf = (int *)f; /* smuggle the function through a data pointer */
  double t, cme;

  t = 0;
  if (verbose)
    printf("函数:%s\n", descr);
  if (test_rc(f, stdout, rc_type)) {
    make_CPU_busy();
    /* NOTE(review): do_test takes int * but is called through a
       void (*)(long *) pointer; confirm against the fcyc() prototype. */
    for (i = 0; i < MAX_ITER_COUNT; i++)
      t += fcyc((void (*)(long *))do_test, intf);
    t = t / MAX_ITER_COUNT; /* mean over all iterations */
    cme = t / (N * N);      /* per matrix element */
    if (verbose)
      printf(" 总周期数 = %.2f, 平均周期/元素 = %.2f\n", t, cme);
    if (cycp)
      *cycp = cme;
  }
}
|
||||
|
||||
/*
 * Compute the grade achieved by function.
 *
 *   cmeas  measured cost of the submitted function
 *   cref   cost of the reference solution
 *   cbest  cost of the best known implementation
 *
 * With smeas = cref / cmeas and sbest = cref / cbest, the result is
 *   0    if smeas < 0.1 * (sbest - 1) + 1
 *   120  if smeas > 1.1 * (sbest - 1) + 1
 *   100 * ((smeas - 1) / (sbest - 1) + 0.1) otherwise.
 */
static double compute_score(double cmeas, double cref, double cbest) {
  double sbest = cref / cbest;
  double smeas = cref / cmeas;

  if (smeas < 0.1 * (sbest - 1) + 1)
    return 0;
  if (smeas > 1.1 * (sbest - 1) + 1)
    return 120;
  return 100 * ((smeas - 1.0) / (sbest - 1.0) + 0.1);
}
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int main(int argc, char *argv[]) {
|
||||
int i;
|
||||
double cme;
|
||||
double cme_c,cme_rc;
|
||||
int EnableScore=0;
|
||||
|
||||
if (argc == 3)
|
||||
{
|
||||
EnableScore = 1;
|
||||
verbose = 0;
|
||||
double cme_c, cme_rc;
|
||||
int EnableScore = 0;
|
||||
|
||||
if (argc == 3) {
|
||||
EnableScore = 1;
|
||||
verbose = 0;
|
||||
}
|
||||
init_tests();
|
||||
set_fcyc_clear_cache(1); /* Set so that clears cache between runs */
|
||||
set_fcyc_clear_cache(1); /* Set so that clears cache between runs */
|
||||
for (i = 0; rc_fun_tab[i].f != NULL; i++) {
|
||||
cme = 100.0;
|
||||
time_rc(rc_fun_tab[i].f,
|
||||
rc_fun_tab[i].rc_type, rc_fun_tab[i].descr, &cme);
|
||||
if (i == 0)
|
||||
{
|
||||
cme_c = cme;
|
||||
if (EnableScore==0)
|
||||
{
|
||||
printf(" 最高\"列求和\"得分 ======================== %.0f\n",
|
||||
compute_score(cme, cstandard[0].cref, cstandard[0].cbest));
|
||||
}
|
||||
}
|
||||
if (i == 1)
|
||||
{
|
||||
cme_rc = cme;
|
||||
if (EnableScore==0)
|
||||
{
|
||||
printf(" 最高\"行和列求和\"得分 ====================== %.0f\n",
|
||||
compute_score(cme, cstandard[1].cref, cstandard[1].cbest));
|
||||
}
|
||||
}
|
||||
cme = 100.0;
|
||||
time_rc(rc_fun_tab[i].f, rc_fun_tab[i].rc_type, rc_fun_tab[i].descr, &cme);
|
||||
if (i == 0) {
|
||||
cme_c = cme;
|
||||
if (EnableScore == 0) {
|
||||
printf(" 最高\"列求和\"得分 ======================== %.0f\n",
|
||||
compute_score(cme, cstandard[0].cref, cstandard[0].cbest));
|
||||
}
|
||||
}
|
||||
if (i == 1) {
|
||||
cme_rc = cme;
|
||||
if (EnableScore == 0) {
|
||||
printf(" 最高\"行和列求和\"得分 ====================== %.0f\n",
|
||||
compute_score(cme, cstandard[1].cref, cstandard[1].cbest));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if (EnableScore)
|
||||
printf("%.2f\t %.0f\t %.2f\t %.0f\t 0\t 0\n",cme_c,compute_score(cme_c, cstandard[0].cref, cstandard[0].cbest),
|
||||
cme_rc,compute_score(cme_rc, cstandard[1].cref, cstandard[1].cbest));
|
||||
printf("%.2f\t %.0f\t %.2f\t %.0f\t 0\t 0\n", cme_c,
|
||||
compute_score(cme_c, cstandard[0].cref, cstandard[0].cbest), cme_rc,
|
||||
compute_score(cme_rc, cstandard[1].cref, cstandard[1].cbest));
|
||||
return 0;
|
||||
}
|
||||
|
||||
BIN
perflab/matrix/rowcol_test.o
Normal file
BIN
perflab/matrix/rowcol_test.o
Normal file
Binary file not shown.
Reference in New Issue
Block a user