matrix fixed
This commit is contained in:
34
perflab/matrix/Makefile
Normal file
34
perflab/matrix/Makefile
Normal file
@@ -0,0 +1,34 @@
|
|||||||
|
CC = gcc
|
||||||
|
CFLAGS = -Wall -O1 -g
|
||||||
|
#LDFLAGS = -lm -lcudart -lcuda
|
||||||
|
|
||||||
|
# Source files
|
||||||
|
SRCS = rowcol_test.c clock.c cpe.c fcyc.c lsquare.c rowcol_202302723005.c
|
||||||
|
#CUDA_SRCS = rowcol.cu
|
||||||
|
OBJS = $(SRCS:.c=.o)
|
||||||
|
#rowcol.o
|
||||||
|
|
||||||
|
# Target executable
|
||||||
|
TARGET = matrix_test
|
||||||
|
|
||||||
|
# Default target
|
||||||
|
all: $(TARGET)
|
||||||
|
|
||||||
|
# Rule to build the executable
|
||||||
|
$(TARGET): $(OBJS)
|
||||||
|
$(CC) $(OBJS) -o $(TARGET) $(LDFLAGS)
|
||||||
|
|
||||||
|
# Rule to build object files
|
||||||
|
%.o: %.c
|
||||||
|
$(CC) $(CFLAGS) -c $< -o $@
|
||||||
|
|
||||||
|
# Rule to build CUDA object files
|
||||||
|
#rowcol.o: rowcol.cu
|
||||||
|
# $(NVCC) $(CUDA_FLAGS) -c $< -o $@
|
||||||
|
|
||||||
|
# Clean rule
|
||||||
|
clean:
|
||||||
|
rm -f $(OBJS) $(TARGET)
|
||||||
|
|
||||||
|
# Phony targets
|
||||||
|
.PHONY: all clean
|
||||||
@@ -2,228 +2,195 @@
|
|||||||
* Retrofitted to use thread-specific timers
|
* Retrofitted to use thread-specific timers
|
||||||
* and to get clock information from /proc/cpuinfo
|
* and to get clock information from /proc/cpuinfo
|
||||||
* (C) R. E. Bryant, 2010
|
* (C) R. E. Bryant, 2010
|
||||||
*
|
* Modified for cross-platform compatibility
|
||||||
*/
|
*/
|
||||||
|
|
||||||
/* When this constant is not defined, uses time stamp counter */
|
#define _GNU_SOURCE // For sched_setaffinity on Linux
|
||||||
#define USE_POSIX 0
|
#include <stdint.h>
|
||||||
|
|
||||||
/* Choice to use cpu_gettime call or Intel time stamp counter directly */
|
|
||||||
|
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
|
||||||
|
#ifdef _WIN32
|
||||||
#include <intrin.h>
|
#include <intrin.h>
|
||||||
//#include <intrinsics.h>
|
|
||||||
#include <windows.h>
|
#include <windows.h>
|
||||||
|
#else
|
||||||
|
#include <sched.h>
|
||||||
#include <time.h>
|
#include <time.h>
|
||||||
|
#include <unistd.h>
|
||||||
|
#include <x86intrin.h>
|
||||||
|
typedef struct {
|
||||||
|
uint64_t QuadPart;
|
||||||
|
} LARGE_INTEGER;
|
||||||
|
typedef void *HANDLE;
|
||||||
|
#define __int64 long long
|
||||||
|
#define Sleep(ms) usleep((ms) * 1000)
|
||||||
|
#endif
|
||||||
|
|
||||||
#include "clock.h"
|
#include "clock.h"
|
||||||
|
|
||||||
/* Use x86 cycle counter */
|
/* Use x86 cycle counter */
|
||||||
|
|
||||||
/* Initialize the cycle counter */
|
|
||||||
static unsigned cyc_hi = 0;
|
static unsigned cyc_hi = 0;
|
||||||
static unsigned cyc_lo = 0;
|
static unsigned cyc_lo = 0;
|
||||||
|
|
||||||
/* Set *hi and *lo to the high and low order bits of the cycle counter.
|
void access_counter(unsigned *hi, unsigned *lo) {
|
||||||
Implementation requires assembly code to use the rdtsc instruction. */
|
uint64_t counter = __rdtsc();
|
||||||
void access_counter(unsigned *hi, unsigned *lo)
|
*hi = (unsigned)(counter >> 32);
|
||||||
{
|
*lo = (unsigned)counter;
|
||||||
|
|
||||||
long long counter;
|
|
||||||
|
|
||||||
counter = __rdtsc();
|
|
||||||
(*hi) = (unsigned int)(counter >> 32);
|
|
||||||
(*lo) = (unsigned int)counter;
|
|
||||||
/*
|
|
||||||
|
|
||||||
LARGE_INTEGER lPerformanceCount;
|
|
||||||
|
|
||||||
QueryPerformanceCounter(&lPerformanceCount);
|
|
||||||
(*hi) = (unsigned int)lPerformanceCount.HighPart;
|
|
||||||
(*lo) = (unsigned int)lPerformanceCount.LowPart;
|
|
||||||
// printf("%08X %08X\n",(*hi),(*lo));
|
|
||||||
*/
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void start_counter() { access_counter(&cyc_hi, &cyc_lo); }
|
||||||
|
|
||||||
/* Record the current value of the cycle counter. */
|
double get_counter() {
|
||||||
void start_counter()
|
unsigned ncyc_hi, ncyc_lo;
|
||||||
{
|
access_counter(&ncyc_hi, &ncyc_lo);
|
||||||
access_counter(&cyc_hi, &cyc_lo);
|
uint64_t start = ((uint64_t)cyc_hi << 32) | cyc_lo;
|
||||||
|
uint64_t end = ((uint64_t)ncyc_hi << 32) | ncyc_lo;
|
||||||
|
return (double)(end - start);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Return the number of cycles since the last call to start_counter. */
|
void make_CPU_busy(void) {
|
||||||
double get_counter()
|
volatile double old_tick = get_counter();
|
||||||
{
|
volatile double new_tick;
|
||||||
unsigned ncyc_hi, ncyc_lo;
|
while ((new_tick - old_tick) < 1000000000) {
|
||||||
unsigned hi, lo, borrow;
|
new_tick = get_counter();
|
||||||
double result;
|
}
|
||||||
|
|
||||||
/* Get cycle counter */
|
|
||||||
access_counter(&ncyc_hi, &ncyc_lo);
|
|
||||||
|
|
||||||
/* Do double precision subtraction */
|
|
||||||
lo = ncyc_lo - cyc_lo;
|
|
||||||
borrow = cyc_lo > ncyc_lo;
|
|
||||||
hi = ncyc_hi - cyc_hi - borrow;
|
|
||||||
result = (double) hi * (1 << 30) * 4 + lo;
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
void make_CPU_busy(void)
|
|
||||||
{
|
|
||||||
volatile double old_tick,new_tick;
|
|
||||||
start_counter();
|
|
||||||
old_tick = get_counter();
|
|
||||||
new_tick = get_counter();
|
|
||||||
while (new_tick - old_tick < 1000000000)
|
|
||||||
new_tick = get_counter();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
//CPU的频率
|
#ifdef _WIN32
|
||||||
double mhz(int verbose)
|
#define GET_TIME(dest) QueryPerformanceCounter(dest)
|
||||||
{
|
#else
|
||||||
LARGE_INTEGER lFrequency;
|
static inline void GET_TIME(LARGE_INTEGER *dest) {
|
||||||
LARGE_INTEGER lPerformanceCount_Start;
|
struct timespec ts;
|
||||||
LARGE_INTEGER lPerformanceCount_End;
|
clock_gettime(CLOCK_MONOTONIC, &ts);
|
||||||
double mhz;
|
dest->QuadPart = (uint64_t)ts.tv_sec * 1000000000 + ts.tv_nsec;
|
||||||
double fTime;
|
|
||||||
__int64 _i64StartCpuCounter;
|
|
||||||
__int64 _i64EndCpuCounter;
|
|
||||||
//On a multiprocessor machine, it should not matter which processor is called.
|
|
||||||
//However, you can get different results on different processors due to bugs in
|
|
||||||
//the BIOS or the HAL. To specify processor affinity for a thread, use the SetThreadAffinityMask function.
|
|
||||||
HANDLE hThread=GetCurrentThread();
|
|
||||||
SetThreadAffinityMask(hThread,0x1);
|
|
||||||
|
|
||||||
//主板上高精度定时器的晶振频率
|
|
||||||
//这个定时器应该就是一片8253或者8254
|
|
||||||
//在intel ich7中集成了8254
|
|
||||||
QueryPerformanceFrequency(&lFrequency);
|
|
||||||
// if (verbose>0)
|
|
||||||
// printf("高精度定时器的晶振频率:%1.0fHz.\n",(double)lFrequency.QuadPart);
|
|
||||||
|
|
||||||
//这个定时器每经过一个时钟周期,其计数器会+1
|
|
||||||
QueryPerformanceCounter(&lPerformanceCount_Start);
|
|
||||||
|
|
||||||
//RDTSC指令:获取CPU经历的时钟周期数
|
|
||||||
_i64StartCpuCounter=__rdtsc();
|
|
||||||
|
|
||||||
//延时长一点,误差会小一点
|
|
||||||
//int nTemp=100000;
|
|
||||||
//while (--nTemp);
|
|
||||||
Sleep(200);
|
|
||||||
|
|
||||||
QueryPerformanceCounter(&lPerformanceCount_End);
|
|
||||||
|
|
||||||
_i64EndCpuCounter=__rdtsc();
|
|
||||||
|
|
||||||
//f=1/T => f=计数次数/(计数次数*T)
|
|
||||||
//这里的“计数次数*T”就是时间差
|
|
||||||
fTime=((double)lPerformanceCount_End.QuadPart-(double)lPerformanceCount_Start.QuadPart)
|
|
||||||
/(double)lFrequency.QuadPart;
|
|
||||||
|
|
||||||
mhz = (_i64EndCpuCounter-_i64StartCpuCounter)/(fTime*1000000.0);
|
|
||||||
if (verbose>0)
|
|
||||||
printf("CPU频率为:%1.6fMHz.\n",mhz);
|
|
||||||
return mhz;
|
|
||||||
}
|
}
|
||||||
|
#define QueryPerformanceFrequency(freq) ((freq)->QuadPart = 1000000000)
|
||||||
|
#endif
|
||||||
|
|
||||||
double CPU_Factor1(void)
|
double mhz(int verbose) {
|
||||||
{
|
|
||||||
double result;
|
|
||||||
int i,j,k,ii,jj,kk;
|
|
||||||
LARGE_INTEGER lStart,lEnd;
|
|
||||||
LARGE_INTEGER lFrequency;
|
LARGE_INTEGER lFrequency;
|
||||||
HANDLE hThread;
|
LARGE_INTEGER lPerformanceCount_Start;
|
||||||
|
LARGE_INTEGER lPerformanceCount_End;
|
||||||
|
double mhz;
|
||||||
double fTime;
|
double fTime;
|
||||||
|
__int64 _i64StartCpuCounter;
|
||||||
|
__int64 _i64EndCpuCounter;
|
||||||
|
|
||||||
|
#ifdef _WIN32
|
||||||
|
HANDLE hThread = GetCurrentThread();
|
||||||
|
SetThreadAffinityMask(hThread, 0x1);
|
||||||
|
#else
|
||||||
|
cpu_set_t cpuset;
|
||||||
|
CPU_ZERO(&cpuset);
|
||||||
|
CPU_SET(0, &cpuset);
|
||||||
|
sched_setaffinity(0, sizeof(cpuset), &cpuset);
|
||||||
|
#endif
|
||||||
|
|
||||||
QueryPerformanceFrequency(&lFrequency);
|
QueryPerformanceFrequency(&lFrequency);
|
||||||
|
GET_TIME(&lPerformanceCount_Start);
|
||||||
|
_i64StartCpuCounter = __rdtsc();
|
||||||
|
Sleep(200);
|
||||||
|
GET_TIME(&lPerformanceCount_End);
|
||||||
|
_i64EndCpuCounter = __rdtsc();
|
||||||
|
|
||||||
ii = 43273;
|
fTime = (lPerformanceCount_End.QuadPart - lPerformanceCount_Start.QuadPart) /
|
||||||
kk = 1238;
|
(double)lFrequency.QuadPart;
|
||||||
result = 1;
|
mhz = (_i64EndCpuCounter - _i64StartCpuCounter) / (fTime * 1000000.0);
|
||||||
jj = 1244;
|
|
||||||
|
|
||||||
hThread=GetCurrentThread();
|
if (verbose > 0) {
|
||||||
SetThreadAffinityMask(hThread,0x1);
|
printf("CPU频率为: %.6fMHz.\n", mhz);
|
||||||
QueryPerformanceCounter(&lStart);
|
}
|
||||||
//_asm("cpuid");
|
return mhz;
|
||||||
start_counter();
|
|
||||||
for (i=0;i<100;i++)
|
|
||||||
for (j=0;j<1000;j++)
|
|
||||||
for (k=0;k<1000;k++)
|
|
||||||
kk += kk*ii+jj;
|
|
||||||
|
|
||||||
result = get_counter();
|
|
||||||
QueryPerformanceCounter(&lEnd);
|
|
||||||
fTime=((double)lEnd.QuadPart-(double)lStart.QuadPart);
|
|
||||||
printf("CPU运行时间为%f",result);
|
|
||||||
printf("\t %f\n",fTime);
|
|
||||||
return result;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
double CPU_Factor(void)
|
double CPU_Factor1(void) {
|
||||||
{
|
double result;
|
||||||
double frequency;
|
int i, j, k;
|
||||||
double multiplier = 1000 * 1000 * 1000;//nano
|
LARGE_INTEGER lStart, lEnd;
|
||||||
LARGE_INTEGER lFrequency;
|
LARGE_INTEGER lFrequency;
|
||||||
LARGE_INTEGER start,stop;
|
double fTime;
|
||||||
HANDLE hThread;
|
|
||||||
int i;
|
|
||||||
const int gigahertz= 1000*1000*1000;
|
|
||||||
const int known_instructions_per_loop = 27317;
|
|
||||||
|
|
||||||
int iterations = 100000000;
|
#ifdef _WIN32
|
||||||
int g = 0;
|
HANDLE hThread = GetCurrentThread();
|
||||||
double normal_ticks_per_second;
|
SetThreadAffinityMask(hThread, 0x1);
|
||||||
double ticks;
|
#else
|
||||||
double time;
|
cpu_set_t cpuset;
|
||||||
double loops_per_sec;
|
CPU_ZERO(&cpuset);
|
||||||
double instructions_per_loop;
|
CPU_SET(0, &cpuset);
|
||||||
double ratio;
|
sched_setaffinity(0, sizeof(cpuset), &cpuset);
|
||||||
double actual_freq;
|
#endif
|
||||||
|
|
||||||
QueryPerformanceFrequency(&lFrequency);
|
QueryPerformanceFrequency(&lFrequency);
|
||||||
frequency = (double)lFrequency.QuadPart;
|
GET_TIME(&lStart);
|
||||||
|
start_counter();
|
||||||
|
|
||||||
hThread=GetCurrentThread();
|
for (i = 0; i < 100; i++)
|
||||||
SetThreadAffinityMask(hThread,0x1);
|
for (j = 0; j < 1000; j++)
|
||||||
QueryPerformanceCounter(&start);
|
for (k = 0; k < 1000; k++)
|
||||||
for( i = 0; i < iterations; i++)
|
;
|
||||||
{
|
|
||||||
g++;
|
|
||||||
g++;
|
|
||||||
g++;
|
|
||||||
g++;
|
|
||||||
}
|
|
||||||
QueryPerformanceCounter(&stop);
|
|
||||||
|
|
||||||
//normal ticks differs from the WMI data, i.e 3125, when WMI 3201, and CPUZ 3199
|
result = get_counter();
|
||||||
normal_ticks_per_second = frequency * 1000;
|
GET_TIME(&lEnd);
|
||||||
ticks = (double)((double)stop.QuadPart - (double)start.QuadPart);
|
|
||||||
time = (ticks * multiplier) /frequency;
|
|
||||||
loops_per_sec = iterations / (time/multiplier);
|
|
||||||
instructions_per_loop = normal_ticks_per_second / loops_per_sec;
|
|
||||||
|
|
||||||
ratio = (instructions_per_loop / known_instructions_per_loop);
|
fTime = (lEnd.QuadPart - lStart.QuadPart) / (double)lFrequency.QuadPart;
|
||||||
actual_freq = normal_ticks_per_second / ratio;
|
printf("CPU计算时长为: %f", result);
|
||||||
/*
|
printf("\t %f\n", fTime);
|
||||||
actual_freq = normal_ticks_per_second / ratio;
|
return result;
|
||||||
actual_freq = known_instructions_per_loop*iterations*multiplier/time;
|
}
|
||||||
|
|
||||||
2293 = x/time;
|
double CPU_Factor(void) {
|
||||||
|
double frequency;
|
||||||
2292.599713*1191533038.809362=known_instructions_per_loop*100000000*1000
|
double multiplier = 1000 * 1000 * 1000; // nano
|
||||||
loops_per_sec = iterations*frequency / ticks
|
LARGE_INTEGER lFrequency;
|
||||||
|
LARGE_INTEGER start, stop;
|
||||||
instructions_per_loop = / loops_per_sec;
|
int i;
|
||||||
*/
|
const int known_instructions_per_loop = 27317;
|
||||||
printf("Perf counter freq: %f\n", normal_ticks_per_second);
|
int iterations = 100000000;
|
||||||
printf("Loops per sec: %f\n", loops_per_sec);
|
int g = 0;
|
||||||
printf("Perf counter freq div loops per sec: %f\n", instructions_per_loop);
|
double normal_ticks_per_second;
|
||||||
printf("Presumed freq: %f\n", actual_freq);
|
double ticks;
|
||||||
printf("ratio: %f\n", ratio);
|
double time;
|
||||||
printf("time=%f\n",time);
|
double loops_per_sec;
|
||||||
return ratio;
|
double instructions_per_loop;
|
||||||
|
double ratio;
|
||||||
|
double actual_freq;
|
||||||
|
|
||||||
|
#ifdef _WIN32
|
||||||
|
HANDLE hThread = GetCurrentThread();
|
||||||
|
SetThreadAffinityMask(hThread, 0x1);
|
||||||
|
#else
|
||||||
|
cpu_set_t cpuset;
|
||||||
|
CPU_ZERO(&cpuset);
|
||||||
|
CPU_SET(0, &cpuset);
|
||||||
|
sched_setaffinity(0, sizeof(cpuset), &cpuset);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
QueryPerformanceFrequency(&lFrequency);
|
||||||
|
frequency = (double)lFrequency.QuadPart;
|
||||||
|
GET_TIME(&start);
|
||||||
|
|
||||||
|
for (i = 0; i < iterations; i++) {
|
||||||
|
g++;
|
||||||
|
g++;
|
||||||
|
g++;
|
||||||
|
g++;
|
||||||
|
}
|
||||||
|
|
||||||
|
GET_TIME(&stop);
|
||||||
|
normal_ticks_per_second = frequency * 1000;
|
||||||
|
ticks = (double)(stop.QuadPart - start.QuadPart);
|
||||||
|
time = (ticks * multiplier) / frequency;
|
||||||
|
loops_per_sec = iterations / (time / multiplier);
|
||||||
|
instructions_per_loop = normal_ticks_per_second / loops_per_sec;
|
||||||
|
ratio = instructions_per_loop / known_instructions_per_loop;
|
||||||
|
actual_freq = normal_ticks_per_second / ratio;
|
||||||
|
|
||||||
|
printf("Perf counter freq: %f\n", normal_ticks_per_second);
|
||||||
|
printf("Loops per sec: %f\n", loops_per_sec);
|
||||||
|
printf("Perf counter freq div loops per sec: %f\n", instructions_per_loop);
|
||||||
|
printf("Presumed freq: %f\n", actual_freq);
|
||||||
|
printf("ratio: %f\n", ratio);
|
||||||
|
printf("time=%f\n", time);
|
||||||
|
return ratio;
|
||||||
}
|
}
|
||||||
|
|||||||
229
perflab/matrix/clock.c.bak
Normal file
229
perflab/matrix/clock.c.bak
Normal file
@@ -0,0 +1,229 @@
|
|||||||
|
/* clock.c
|
||||||
|
* Retrofitted to use thread-specific timers
|
||||||
|
* and to get clock information from /proc/cpuinfo
|
||||||
|
* (C) R. E. Bryant, 2010
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* When this constant is not defined, uses time stamp counter */
|
||||||
|
#define USE_POSIX 0
|
||||||
|
|
||||||
|
/* Choice to use cpu_gettime call or Intel time stamp counter directly */
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <x86intrin.h>
|
||||||
|
//#include <intrinsics.h>
|
||||||
|
//#include <windows.h>
|
||||||
|
#include <time.h>
|
||||||
|
#include "clock.h"
|
||||||
|
|
||||||
|
/* Use x86 cycle counter */
|
||||||
|
|
||||||
|
/* Initialize the cycle counter */
|
||||||
|
static unsigned cyc_hi = 0;
|
||||||
|
static unsigned cyc_lo = 0;
|
||||||
|
|
||||||
|
/* Set *hi and *lo to the high and low order bits of the cycle counter.
|
||||||
|
Implementation requires assembly code to use the rdtsc instruction. */
|
||||||
|
void access_counter(unsigned *hi, unsigned *lo)
|
||||||
|
{
|
||||||
|
|
||||||
|
long long counter;
|
||||||
|
|
||||||
|
counter = __rdtsc();
|
||||||
|
(*hi) = (unsigned int)(counter >> 32);
|
||||||
|
(*lo) = (unsigned int)counter;
|
||||||
|
/*
|
||||||
|
|
||||||
|
LARGE_INTEGER lPerformanceCount;
|
||||||
|
|
||||||
|
QueryPerformanceCounter(&lPerformanceCount);
|
||||||
|
(*hi) = (unsigned int)lPerformanceCount.HighPart;
|
||||||
|
(*lo) = (unsigned int)lPerformanceCount.LowPart;
|
||||||
|
// printf("%08X %08X\n",(*hi),(*lo));
|
||||||
|
*/
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/* Record the current value of the cycle counter. */
|
||||||
|
void start_counter()
|
||||||
|
{
|
||||||
|
access_counter(&cyc_hi, &cyc_lo);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Return the number of cycles since the last call to start_counter. */
|
||||||
|
double get_counter()
|
||||||
|
{
|
||||||
|
unsigned ncyc_hi, ncyc_lo;
|
||||||
|
unsigned hi, lo, borrow;
|
||||||
|
double result;
|
||||||
|
|
||||||
|
/* Get cycle counter */
|
||||||
|
access_counter(&ncyc_hi, &ncyc_lo);
|
||||||
|
|
||||||
|
/* Do double precision subtraction */
|
||||||
|
lo = ncyc_lo - cyc_lo;
|
||||||
|
borrow = cyc_lo > ncyc_lo;
|
||||||
|
hi = ncyc_hi - cyc_hi - borrow;
|
||||||
|
result = (double) hi * (1 << 30) * 4 + lo;
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
void make_CPU_busy(void)
|
||||||
|
{
|
||||||
|
volatile double old_tick,new_tick;
|
||||||
|
start_counter();
|
||||||
|
old_tick = get_counter();
|
||||||
|
new_tick = get_counter();
|
||||||
|
while (new_tick - old_tick < 1000000000)
|
||||||
|
new_tick = get_counter();
|
||||||
|
}
|
||||||
|
|
||||||
|
//CPU的频率
|
||||||
|
double mhz(int verbose)
|
||||||
|
{
|
||||||
|
LARGE_INTEGER lFrequency;
|
||||||
|
LARGE_INTEGER lPerformanceCount_Start;
|
||||||
|
LARGE_INTEGER lPerformanceCount_End;
|
||||||
|
double mhz;
|
||||||
|
double fTime;
|
||||||
|
__int64 _i64StartCpuCounter;
|
||||||
|
__int64 _i64EndCpuCounter;
|
||||||
|
//On a multiprocessor machine, it should not matter which processor is called.
|
||||||
|
//However, you can get different results on different processors due to bugs in
|
||||||
|
//the BIOS or the HAL. To specify processor affinity for a thread, use the SetThreadAffinityMask function.
|
||||||
|
HANDLE hThread=GetCurrentThread();
|
||||||
|
SetThreadAffinityMask(hThread,0x1);
|
||||||
|
|
||||||
|
//主板上高精度定时器的晶振频率
|
||||||
|
//这个定时器应该就是一片8253或者8254
|
||||||
|
//在intel ich7中集成了8254
|
||||||
|
QueryPerformanceFrequency(&lFrequency);
|
||||||
|
// if (verbose>0)
|
||||||
|
// printf("高精度定时器的晶振频率:%1.0fHz.\n",(double)lFrequency.QuadPart);
|
||||||
|
|
||||||
|
//这个定时器每经过一个时钟周期,其计数器会+1
|
||||||
|
QueryPerformanceCounter(&lPerformanceCount_Start);
|
||||||
|
|
||||||
|
//RDTSC指令:获取CPU经历的时钟周期数
|
||||||
|
_i64StartCpuCounter=__rdtsc();
|
||||||
|
|
||||||
|
//延时长一点,误差会小一点
|
||||||
|
//int nTemp=100000;
|
||||||
|
//while (--nTemp);
|
||||||
|
Sleep(200);
|
||||||
|
|
||||||
|
QueryPerformanceCounter(&lPerformanceCount_End);
|
||||||
|
|
||||||
|
_i64EndCpuCounter=__rdtsc();
|
||||||
|
|
||||||
|
//f=1/T => f=计数次数/(计数次数*T)
|
||||||
|
//这里的“计数次数*T”就是时间差
|
||||||
|
fTime=((double)lPerformanceCount_End.QuadPart-(double)lPerformanceCount_Start.QuadPart)
|
||||||
|
/(double)lFrequency.QuadPart;
|
||||||
|
|
||||||
|
mhz = (_i64EndCpuCounter-_i64StartCpuCounter)/(fTime*1000000.0);
|
||||||
|
if (verbose>0)
|
||||||
|
printf("CPU频率为:%1.6fMHz.\n",mhz);
|
||||||
|
return mhz;
|
||||||
|
}
|
||||||
|
|
||||||
|
double CPU_Factor1(void)
|
||||||
|
{
|
||||||
|
double result;
|
||||||
|
int i,j,k,ii,jj,kk;
|
||||||
|
LARGE_INTEGER lStart,lEnd;
|
||||||
|
LARGE_INTEGER lFrequency;
|
||||||
|
HANDLE hThread;
|
||||||
|
double fTime;
|
||||||
|
|
||||||
|
QueryPerformanceFrequency(&lFrequency);
|
||||||
|
|
||||||
|
ii = 43273;
|
||||||
|
kk = 1238;
|
||||||
|
result = 1;
|
||||||
|
jj = 1244;
|
||||||
|
|
||||||
|
hThread=GetCurrentThread();
|
||||||
|
SetThreadAffinityMask(hThread,0x1);
|
||||||
|
QueryPerformanceCounter(&lStart);
|
||||||
|
//_asm("cpuid");
|
||||||
|
start_counter();
|
||||||
|
for (i=0;i<100;i++)
|
||||||
|
for (j=0;j<1000;j++)
|
||||||
|
for (k=0;k<1000;k++)
|
||||||
|
kk += kk*ii+jj;
|
||||||
|
|
||||||
|
result = get_counter();
|
||||||
|
QueryPerformanceCounter(&lEnd);
|
||||||
|
fTime=((double)lEnd.QuadPart-(double)lStart.QuadPart);
|
||||||
|
printf("CPU运行时间为%f",result);
|
||||||
|
printf("\t %f\n",fTime);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
double CPU_Factor(void)
|
||||||
|
{
|
||||||
|
double frequency;
|
||||||
|
double multiplier = 1000 * 1000 * 1000;//nano
|
||||||
|
LARGE_INTEGER lFrequency;
|
||||||
|
LARGE_INTEGER start,stop;
|
||||||
|
HANDLE hThread;
|
||||||
|
int i;
|
||||||
|
const int gigahertz= 1000*1000*1000;
|
||||||
|
const int known_instructions_per_loop = 27317;
|
||||||
|
|
||||||
|
int iterations = 100000000;
|
||||||
|
int g = 0;
|
||||||
|
double normal_ticks_per_second;
|
||||||
|
double ticks;
|
||||||
|
double time;
|
||||||
|
double loops_per_sec;
|
||||||
|
double instructions_per_loop;
|
||||||
|
double ratio;
|
||||||
|
double actual_freq;
|
||||||
|
|
||||||
|
QueryPerformanceFrequency(&lFrequency);
|
||||||
|
frequency = (double)lFrequency.QuadPart;
|
||||||
|
|
||||||
|
hThread=GetCurrentThread();
|
||||||
|
SetThreadAffinityMask(hThread,0x1);
|
||||||
|
QueryPerformanceCounter(&start);
|
||||||
|
for( i = 0; i < iterations; i++)
|
||||||
|
{
|
||||||
|
g++;
|
||||||
|
g++;
|
||||||
|
g++;
|
||||||
|
g++;
|
||||||
|
}
|
||||||
|
QueryPerformanceCounter(&stop);
|
||||||
|
|
||||||
|
//normal ticks differs from the WMI data, i.e 3125, when WMI 3201, and CPUZ 3199
|
||||||
|
normal_ticks_per_second = frequency * 1000;
|
||||||
|
ticks = (double)((double)stop.QuadPart - (double)start.QuadPart);
|
||||||
|
time = (ticks * multiplier) /frequency;
|
||||||
|
loops_per_sec = iterations / (time/multiplier);
|
||||||
|
instructions_per_loop = normal_ticks_per_second / loops_per_sec;
|
||||||
|
|
||||||
|
ratio = (instructions_per_loop / known_instructions_per_loop);
|
||||||
|
actual_freq = normal_ticks_per_second / ratio;
|
||||||
|
/*
|
||||||
|
actual_freq = normal_ticks_per_second / ratio;
|
||||||
|
actual_freq = known_instructions_per_loop*iterations*multiplier/time;
|
||||||
|
|
||||||
|
2293 = x/time;
|
||||||
|
|
||||||
|
2292.599713*1191533038.809362=known_instructions_per_loop*100000000*1000
|
||||||
|
loops_per_sec = iterations*frequency / ticks
|
||||||
|
|
||||||
|
instructions_per_loop = / loops_per_sec;
|
||||||
|
*/
|
||||||
|
printf("Perf counter freq: %f\n", normal_ticks_per_second);
|
||||||
|
printf("Loops per sec: %f\n", loops_per_sec);
|
||||||
|
printf("Perf counter freq div loops per sec: %f\n", instructions_per_loop);
|
||||||
|
printf("Presumed freq: %f\n", actual_freq);
|
||||||
|
printf("ratio: %f\n", ratio);
|
||||||
|
printf("time=%f\n",time);
|
||||||
|
return ratio;
|
||||||
|
}
|
||||||
BIN
perflab/matrix/clock.o
Normal file
BIN
perflab/matrix/clock.o
Normal file
Binary file not shown.
BIN
perflab/matrix/cpe.o
Normal file
BIN
perflab/matrix/cpe.o
Normal file
Binary file not shown.
@@ -119,7 +119,7 @@ double fcyc(test_funct f, int *params)
|
|||||||
if (clear_cache)
|
if (clear_cache)
|
||||||
clear();
|
clear();
|
||||||
start_counter();
|
start_counter();
|
||||||
f(params);
|
f((long*)params);
|
||||||
cyc = get_counter();
|
cyc = get_counter();
|
||||||
if (cyc > 0.0)
|
if (cyc > 0.0)
|
||||||
add_sample(cyc);
|
add_sample(cyc);
|
||||||
@@ -131,7 +131,7 @@ double fcyc(test_funct f, int *params)
|
|||||||
clear();
|
clear();
|
||||||
start_counter();
|
start_counter();
|
||||||
for (i=0;i<MAX_ITER_TIMES;i++)
|
for (i=0;i<MAX_ITER_TIMES;i++)
|
||||||
f(params);
|
f((long*)params);
|
||||||
cyc = get_counter()/MAX_ITER_TIMES;
|
cyc = get_counter()/MAX_ITER_TIMES;
|
||||||
if (cyc > 0.0)
|
if (cyc > 0.0)
|
||||||
add_sample(cyc);
|
add_sample(cyc);
|
||||||
|
|||||||
BIN
perflab/matrix/fcyc.o
Normal file
BIN
perflab/matrix/fcyc.o
Normal file
Binary file not shown.
BIN
perflab/matrix/lsquare.o
Normal file
BIN
perflab/matrix/lsquare.o
Normal file
Binary file not shown.
BIN
perflab/matrix/matrix_test
Normal file
BIN
perflab/matrix/matrix_test
Normal file
Binary file not shown.
@@ -1,77 +1,69 @@
|
|||||||
/**************************************************************************
|
/**************************************************************************
|
||||||
行/列求和函数。按下面的要求编辑此文件:
|
??/???????????????????????????????
|
||||||
1. 将你的学号、姓名,以注释的方式写到下面;
|
1. ???????????????????????????????
|
||||||
2. 实现不同版本的行列求和函数;
|
2. ??????????????????????
|
||||||
3. 编辑rc_fun_rec rc_fun_tab数组,将你的最好的答案
|
3. ??rc_fun_rec rc_fun_tab??????????????????
|
||||||
(最好的行和列求和、最好的列求和)作为数组的前两项
|
???????????????????????????????????????????
|
||||||
***************************************************************************/
|
***************************************************************************/
|
||||||
|
|
||||||
/*
|
/*
|
||||||
学号:201209054233
|
????201209054233
|
||||||
姓名:夜半加班狂
|
??????????????
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
#include "rowcol.h"
|
||||||
|
#include <math.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
|
||||||
#include <stdio.h>
|
/* ????????????????? */
|
||||||
#include <stdlib.h>
|
/* ???????????????????????????????????????????????
|
||||||
#include "rowcol.h"
|
??????????2?????????????????
|
||||||
#include <math.h>
|
|
||||||
|
|
||||||
/* 参考的列求和函数实现 */
|
|
||||||
/* 计算矩阵中的每一列的和。请注意对于行和列求和来说,调用参数是
|
|
||||||
一样的,只是第2个参数不会用到而已
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
void c_sum(matrix_t M, vector_t rowsum, vector_t colsum)
|
void c_sum(matrix_t M, vector_t rowsum, vector_t colsum) {
|
||||||
{
|
int i, j;
|
||||||
int i,j;
|
for (j = 0; j < N; j++) {
|
||||||
|
colsum[j] = 0;
|
||||||
|
for (i = 0; i < N; i++)
|
||||||
|
colsum[j] += M[i][j];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ???????????????????? */
|
||||||
|
/* ??????????????????????? */
|
||||||
|
|
||||||
|
void rc_sum(matrix_t M, vector_t rowsum, vector_t colsum) {
|
||||||
|
int i, j;
|
||||||
|
for (i = 0; i < N; i++) {
|
||||||
|
rowsum[i] = colsum[i] = 0;
|
||||||
for (j = 0; j < N; j++) {
|
for (j = 0; j < N; j++) {
|
||||||
colsum[j] = 0;
|
rowsum[i] += M[i][j];
|
||||||
for (i = 0; i < N; i++)
|
colsum[i] += M[j][i];
|
||||||
colsum[j] += M[i][j];
|
|
||||||
}
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/* 参考的列和行求和函数实现 */
|
|
||||||
/* 计算矩阵中的每一行、每一列的和。 */
|
|
||||||
|
|
||||||
void rc_sum(matrix_t M, vector_t rowsum, vector_t colsum)
|
|
||||||
{
|
|
||||||
int i,j;
|
|
||||||
for (i = 0; i < N; i++) {
|
|
||||||
rowsum[i] = colsum[i] = 0;
|
|
||||||
for (j = 0; j < N; j++) {
|
|
||||||
rowsum[i] += M[i][j];
|
|
||||||
colsum[i] += M[j][i];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
这个表格包含多个数组元素,每一组元素(函数名字, COL/ROWCOL, "描述字符串")
|
????????????????????????????????????????, COL/ROWCOL, "?????????"??
|
||||||
COL表示该函数仅仅计算每一列的和
|
COL??????????????????????
|
||||||
ROWCOL表示该函数计算每一行、每一列的和
|
ROWCOL???????????????????????
|
||||||
将你认为最好的两个实现,放在最前面。
|
?????????????????????????????
|
||||||
比如:
|
????
|
||||||
{my_c_sum1, "超级垃圾列求和实现"},
|
{my_c_sum1, "?????????????????"},
|
||||||
{my_rc_sum2, "好一点的行列求和实现"},
|
{my_rc_sum2, "??????????????????"},
|
||||||
*/
|
*/
|
||||||
|
|
||||||
rc_fun_rec rc_fun_tab[] =
|
rc_fun_rec rc_fun_tab[] = {
|
||||||
{
|
|
||||||
|
|
||||||
/* 第一项,应当是你写的最好列求和的函数实现 */
|
/* ???????????????????????????????? */
|
||||||
{c_sum, COL, "Best column sum"},
|
{c_sum, COL, "Best column sum"},
|
||||||
/* 第二项,应当是你写的最好行列求和的函数实现 */
|
/* ?????????????????????????????????? */
|
||||||
{rc_sum, ROWCOL, "Best row and column sum"},
|
{rc_sum, ROWCOL, "Best row and column sum"},
|
||||||
|
|
||||||
{c_sum, COL, "Column sum, reference implementation"},
|
{c_sum, COL, "Column sum, reference implementation"},
|
||||||
|
|
||||||
{rc_sum, ROWCOL, "Row and column sum, reference implementation"},
|
{rc_sum, ROWCOL, "Row and column sum, reference implementation"},
|
||||||
|
|
||||||
/* 下面的代码不能修改或者删除!!表明数组列表结束 */
|
/* ??????????????????????????????????????? */
|
||||||
{NULL,ROWCOL,NULL}
|
{NULL, ROWCOL, NULL}};
|
||||||
};
|
|
||||||
162
perflab/matrix/rowcol.c~
Normal file
162
perflab/matrix/rowcol.c~
Normal file
@@ -0,0 +1,162 @@
|
|||||||
|
/**************************************************************************
|
||||||
|
行/列求和函数。按下面的要求编辑此文件:
|
||||||
|
1. 将你的学号、姓名,以注释的方式写到下面;
|
||||||
|
2. 实现不同版本的行列求和函数;
|
||||||
|
3. 编辑rc_fun_rec rc_fun_tab数组,将你的最好的答案
|
||||||
|
(最好的行和列求和、最好的列求和)作为数组的前两项
|
||||||
|
***************************************************************************/
|
||||||
|
|
||||||
|
/*
|
||||||
|
学号:202302723005
|
||||||
|
姓名:程景愉
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include "rowcol.h"
|
||||||
|
#include <math.h>
|
||||||
|
#include <cuda_runtime.h>
|
||||||
|
|
||||||
|
/* 参考的列求和函数实现 */
|
||||||
|
/* 计算矩阵中的每一列的和。请注意对于行和列求和来说,调用参数是
|
||||||
|
一样的,只是第2个参数不会用到而已
|
||||||
|
*/
|
||||||
|
|
||||||
|
void c_sum(matrix_t M, vector_t rowsum, vector_t colsum)
|
||||||
|
{
|
||||||
|
int i,j;
|
||||||
|
for (j = 0; j < N; j++) {
|
||||||
|
colsum[j] = 0;
|
||||||
|
for (i = 0; i < N; i++)
|
||||||
|
colsum[j] += M[i][j];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/* 参考的列和行求和函数实现 */
|
||||||
|
/* 计算矩阵中的每一行、每一列的和。 */
|
||||||
|
|
||||||
|
void rc_sum(matrix_t M, vector_t rowsum, vector_t colsum)
|
||||||
|
{
|
||||||
|
int i,j;
|
||||||
|
for (i = 0; i < N; i++) {
|
||||||
|
rowsum[i] = colsum[i] = 0;
|
||||||
|
for (j = 0; j < N; j++) {
|
||||||
|
rowsum[i] += M[i][j];
|
||||||
|
colsum[i] += M[j][i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* CUDA优化的列求和函数 */
|
||||||
|
void cuda_c_sum(matrix_t M, vector_t rowsum, vector_t colsum)
|
||||||
|
{
|
||||||
|
// 分配设备内存
|
||||||
|
int *d_M, *d_colsum;
|
||||||
|
cudaMalloc(&d_M, N * N * sizeof(int));
|
||||||
|
cudaMalloc(&d_colsum, N * sizeof(int));
|
||||||
|
|
||||||
|
// 将数据从主机复制到设备
|
||||||
|
cudaMemcpy(d_M, M, N * N * sizeof(int), cudaMemcpyHostToDevice);
|
||||||
|
|
||||||
|
// 定义CUDA核函数
|
||||||
|
dim3 blockDim(256);
|
||||||
|
dim3 gridDim((N + blockDim.x - 1) / blockDim.x);
|
||||||
|
|
||||||
|
// 启动核函数
|
||||||
|
cudaColumnSum<<<gridDim, blockDim>>>(d_M, d_colsum);
|
||||||
|
|
||||||
|
// 将结果从设备复制回主机
|
||||||
|
cudaMemcpy(colsum, d_colsum, N * sizeof(int), cudaMemcpyDeviceToHost);
|
||||||
|
|
||||||
|
// 释放设备内存
|
||||||
|
cudaFree(d_M);
|
||||||
|
cudaFree(d_colsum);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* CUDA优化的行列求和函数 */
|
||||||
|
void cuda_rc_sum(matrix_t M, vector_t rowsum, vector_t colsum)
|
||||||
|
{
|
||||||
|
// 分配设备内存
|
||||||
|
int *d_M, *d_rowsum, *d_colsum;
|
||||||
|
cudaMalloc(&d_M, N * N * sizeof(int));
|
||||||
|
cudaMalloc(&d_rowsum, N * sizeof(int));
|
||||||
|
cudaMalloc(&d_colsum, N * sizeof(int));
|
||||||
|
|
||||||
|
// 将数据从主机复制到设备
|
||||||
|
cudaMemcpy(d_M, M, N * N * sizeof(int), cudaMemcpyHostToDevice);
|
||||||
|
|
||||||
|
// 定义CUDA核函数
|
||||||
|
dim3 blockDim(256);
|
||||||
|
dim3 gridDim((N + blockDim.x - 1) / blockDim.x);
|
||||||
|
|
||||||
|
// 启动核函数
|
||||||
|
cudaRowColSum<<<gridDim, blockDim>>>(d_M, d_rowsum, d_colsum);
|
||||||
|
|
||||||
|
// 将结果从设备复制回主机
|
||||||
|
cudaMemcpy(rowsum, d_rowsum, N * sizeof(int), cudaMemcpyDeviceToHost);
|
||||||
|
cudaMemcpy(colsum, d_colsum, N * sizeof(int), cudaMemcpyDeviceToHost);
|
||||||
|
|
||||||
|
// 释放设备内存
|
||||||
|
cudaFree(d_M);
|
||||||
|
cudaFree(d_rowsum);
|
||||||
|
cudaFree(d_colsum);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* CUDA核函数 - 列求和 */
|
||||||
|
__global__ void cudaColumnSum(int *M, int *colsum)
|
||||||
|
{
|
||||||
|
int col = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
|
if (col < N) {
|
||||||
|
colsum[col] = 0;
|
||||||
|
for (int row = 0; row < N; row++) {
|
||||||
|
colsum[col] += M[row * N + col];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * CUDA kernel: row and column sums.
 *
 * Each thread handles one index idx
 * (blockIdx.x * blockDim.x + threadIdx.x) and computes both the sum of row
 * idx and the sum of column idx of the row-major N x N array M; threads
 * with idx >= N do nothing.
 *
 * Both totals are accumulated in local variables and written once, instead
 * of reading and writing rowsum[idx] / colsum[idx] on every iteration.
 *
 * NOTE(review): in the row loop, threads with neighbouring idx read
 * addresses N ints apart (M[idx * N + j]); in the column loop they read
 * neighbouring addresses (M[i * N + idx]).
 */
__global__ void cudaRowColSum(int *M, int *rowsum, int *colsum)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (idx >= N)
        return;

    /* Row sum: walk along row idx. */
    int row_total = 0;
    for (int j = 0; j < N; j++)
        row_total += M[idx * N + j];
    rowsum[idx] = row_total;

    /* Column sum: walk down column idx. */
    int col_total = 0;
    for (int i = 0; i < N; i++)
        col_total += M[i * N + idx];
    colsum[idx] = col_total;
}
|
||||||
|
|
||||||
|
/*
 * Table of implementations to run.  Each entry is
 *     (function, COL/ROWCOL, "description string")
 *   COL    - the function computes only the column sums
 *   ROWCOL - the function computes both the row sums and the column sums
 * Put the two implementations you consider best first, for example:
 *     {my_c_sum1, "really bad column sum"},
 *     {my_rc_sum2, "a somewhat better row and column sum"},
 */
rc_fun_rec rc_fun_tab[] = {
    /* Entry 0 should be your best column-sum implementation */
    {cuda_c_sum, COL, "CUDA optimized column sum"},
    /* Entry 1 should be your best row-and-column-sum implementation */
    {cuda_rc_sum, ROWCOL, "CUDA optimized row and column sum"},

    {c_sum, COL, "Column sum, reference implementation"},
    {rc_sum, ROWCOL, "Row and column sum, reference implementation"},

    /* Do not modify or remove: this entry marks the end of the table */
    {NULL, ROWCOL, NULL}
};
|
||||||
BIN
perflab/matrix/rowcol.o
Normal file
BIN
perflab/matrix/rowcol.o
Normal file
Binary file not shown.
240
perflab/matrix/rowcol.y~
Normal file
240
perflab/matrix/rowcol.y~
Normal file
@@ -0,0 +1,240 @@
|
|||||||
|
/**************************************************************************
  Row/column sum functions.  Edit this file as follows:
  1. Write your student ID and name in the comment below;
  2. Implement different versions of the row/column sum functions;
  3. Edit the rc_fun_rec rc_fun_tab array so that your best answers
     (best row-and-column sum, best column sum) are its first two entries.
***************************************************************************/
|
||||||
|
|
||||||
|
/*
|
||||||
|
靠靠201209054233
|
||||||
|
靠靠靠靠靠靠靠
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include "rowcol.h"
|
||||||
|
#include <math.h>
|
||||||
|
|
||||||
|
/* 靠靠靠靠靠靠靠靠<E99DA0> */
|
||||||
|
/* 靠靠靠靠靠<E99DA0><E99DA0>靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠蚩靠靠
|
||||||
|
靠靠靠靠靠2靠靠靠靠靠旷靠靠<E99DA0>
|
||||||
|
*/
|
||||||
|
|
||||||
|
void c_sum(matrix_t M, vector_t rowsum, vector_t colsum)
|
||||||
|
{
|
||||||
|
int i,j;
|
||||||
|
for (j = 0; j < N; j++) {
|
||||||
|
colsum[j] = 0;
|
||||||
|
for (i = 0; i < N; i++)
|
||||||
|
colsum[j] += M[i][j];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/* 靠靠靠靠靠靠靠靠靠靠 */
|
||||||
|
/* 靠靠靠靠靠<E99DA0><E99DA0>靠靠<E99DA0>靠靠靠 */
|
||||||
|
|
||||||
|
void rc_sum(matrix_t M, vector_t rowsum, vector_t colsum)
|
||||||
|
{
|
||||||
|
int i,j;
|
||||||
|
for (i = 0; i < N; i++) {
|
||||||
|
rowsum[i] = colsum[i] = 0;
|
||||||
|
for (j = 0; j < N; j++) {
|
||||||
|
rowsum[i] += M[i][j];
|
||||||
|
colsum[i] += M[j][i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
靠靠靠靠靠靠靠靠靠靠靠靠<E99DA0>靠靠靠靠靠靠靠<E99DA0>, COL/ROWCOL, "靠靠靠靠<E99DA0>"靠
|
||||||
|
COL靠靠<E99DA0>靠靠靠靠靠<E99DA0>靠靠<E99DA0>
|
||||||
|
ROWCOL靠靠<E99DA0>靠靠靠<E99DA0>靠靠<E99DA0>靠靠<E99DA0>
|
||||||
|
靠靠靠靠靠蹩靠靠靠靠靠靠靠靠<E99DA0>
|
||||||
|
靠靠
|
||||||
|
{my_c_sum1, "靠靠靠靠靠靠靠靠<E99DA0>"},
|
||||||
|
{my_rc_sum2, "靠靠靠靠靠靠靠靠靠"},
|
||||||
|
*/
|
||||||
|
|
||||||
|
rc_fun_rec rc_fun_tab[] =
|
||||||
|
{
|
||||||
|
|
||||||
|
/* 靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠 */
|
||||||
|
{c_sum, COL, "Best column sum"},
|
||||||
|
/* 靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠 */
|
||||||
|
{rc_sum, ROWCOL, "Best row and column sum"},
|
||||||
|
|
||||||
|
{c_sum, COL, "Column sum, reference implementation"},
|
||||||
|
|
||||||
|
{rc_sum, ROWCOL, "Row and column sum, reference implementation"},
|
||||||
|
|
||||||
|
/* 靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠<E99DA0> */
|
||||||
|
{NULL,ROWCOL,NULL}
|
||||||
|
};
|
||||||
|
|
||||||
|
// /**************************************************************************
|
||||||
|
// 行/列求和函数。按下面的要求编辑此文件:
|
||||||
|
// 1. 将你的学号、姓名,以注释的方式写到下面;
|
||||||
|
// 2. 实现不同版本的行列求和函数;
|
||||||
|
// 3. 编辑rc_fun_rec rc_fun_tab数组,将你的最好的答案
|
||||||
|
// (最好的行和列求和、最好的列求和)作为数组的前两项
|
||||||
|
// ***************************************************************************/
|
||||||
|
//
|
||||||
|
// /*
|
||||||
|
// 学号:202302723005
|
||||||
|
// 姓名:程景愉
|
||||||
|
// */
|
||||||
|
//
|
||||||
|
//
|
||||||
|
// #include <stdio.h>
|
||||||
|
// #include <stdlib.h>
|
||||||
|
// #include "rowcol.h"
|
||||||
|
// #include <math.h>
|
||||||
|
// #include <cuda_runtime.h>
|
||||||
|
//
|
||||||
|
// /* 参考的列求和函数实现 */
|
||||||
|
// /* 计算矩阵中的每一列的和。请注意对于行和列求和来说,调用参数是
|
||||||
|
// 一样的,只是第2个参数不会用到而已
|
||||||
|
// */
|
||||||
|
//
|
||||||
|
// void c_sum(matrix_t M, vector_t rowsum, vector_t colsum)
|
||||||
|
// {
|
||||||
|
// int i,j;
|
||||||
|
// for (j = 0; j < N; j++) {
|
||||||
|
// colsum[j] = 0;
|
||||||
|
// for (i = 0; i < N; i++)
|
||||||
|
// colsum[j] += M[i][j];
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
//
|
||||||
|
// /* 参考的列和行求和函数实现 */
|
||||||
|
// /* 计算矩阵中的每一行、每一列的和。 */
|
||||||
|
//
|
||||||
|
// void rc_sum(matrix_t M, vector_t rowsum, vector_t colsum)
|
||||||
|
// {
|
||||||
|
// int i,j;
|
||||||
|
// for (i = 0; i < N; i++) {
|
||||||
|
// rowsum[i] = colsum[i] = 0;
|
||||||
|
// for (j = 0; j < N; j++) {
|
||||||
|
// rowsum[i] += M[i][j];
|
||||||
|
// colsum[i] += M[j][i];
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// /* CUDA优化的列求和函数 */
|
||||||
|
// void cuda_c_sum(matrix_t M, vector_t rowsum, vector_t colsum)
|
||||||
|
// {
|
||||||
|
// // 分配设备内存
|
||||||
|
// int *d_M, *d_colsum;
|
||||||
|
// cudaMalloc(&d_M, N * N * sizeof(int));
|
||||||
|
// cudaMalloc(&d_colsum, N * sizeof(int));
|
||||||
|
//
|
||||||
|
// // 将数据从主机复制到设备
|
||||||
|
// cudaMemcpy(d_M, M, N * N * sizeof(int), cudaMemcpyHostToDevice);
|
||||||
|
//
|
||||||
|
// // 定义CUDA核函数
|
||||||
|
// dim3 blockDim(256);
|
||||||
|
// dim3 gridDim((N + blockDim.x - 1) / blockDim.x);
|
||||||
|
//
|
||||||
|
// // 启动核函数
|
||||||
|
// cudaColumnSum<<<gridDim, blockDim>>>(d_M, d_colsum);
|
||||||
|
//
|
||||||
|
// // 将结果从设备复制回主机
|
||||||
|
// cudaMemcpy(colsum, d_colsum, N * sizeof(int), cudaMemcpyDeviceToHost);
|
||||||
|
//
|
||||||
|
// // 释放设备内存
|
||||||
|
// cudaFree(d_M);
|
||||||
|
// cudaFree(d_colsum);
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// /* CUDA优化的行列求和函数 */
|
||||||
|
// void cuda_rc_sum(matrix_t M, vector_t rowsum, vector_t colsum)
|
||||||
|
// {
|
||||||
|
// // 分配设备内存
|
||||||
|
// int *d_M, *d_rowsum, *d_colsum;
|
||||||
|
// cudaMalloc(&d_M, N * N * sizeof(int));
|
||||||
|
// cudaMalloc(&d_rowsum, N * sizeof(int));
|
||||||
|
// cudaMalloc(&d_colsum, N * sizeof(int));
|
||||||
|
//
|
||||||
|
// // 将数据从主机复制到设备
|
||||||
|
// cudaMemcpy(d_M, M, N * N * sizeof(int), cudaMemcpyHostToDevice);
|
||||||
|
//
|
||||||
|
// // 定义CUDA核函数
|
||||||
|
// dim3 blockDim(256);
|
||||||
|
// dim3 gridDim((N + blockDim.x - 1) / blockDim.x);
|
||||||
|
//
|
||||||
|
// // 启动核函数
|
||||||
|
// cudaRowColSum<<<gridDim, blockDim>>>(d_M, d_rowsum, d_colsum);
|
||||||
|
//
|
||||||
|
// // 将结果从设备复制回主机
|
||||||
|
// cudaMemcpy(rowsum, d_rowsum, N * sizeof(int), cudaMemcpyDeviceToHost);
|
||||||
|
// cudaMemcpy(colsum, d_colsum, N * sizeof(int), cudaMemcpyDeviceToHost);
|
||||||
|
//
|
||||||
|
// // 释放设备内存
|
||||||
|
// cudaFree(d_M);
|
||||||
|
// cudaFree(d_rowsum);
|
||||||
|
// cudaFree(d_colsum);
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// /* CUDA核函数 - 列求和 */
|
||||||
|
// __global__ void cudaColumnSum(int *M, int *colsum)
|
||||||
|
// {
|
||||||
|
// int col = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
|
// if (col < N) {
|
||||||
|
// colsum[col] = 0;
|
||||||
|
// for (int row = 0; row < N; row++) {
|
||||||
|
// colsum[col] += M[row * N + col];
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// /* CUDA核函数 - 行列求和 */
|
||||||
|
// __global__ void cudaRowColSum(int *M, int *rowsum, int *colsum)
|
||||||
|
// {
|
||||||
|
// int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
|
// if (idx < N) {
|
||||||
|
// // 计算行和
|
||||||
|
// rowsum[idx] = 0;
|
||||||
|
// for (int j = 0; j < N; j++) {
|
||||||
|
// rowsum[idx] += M[idx * N + j];
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// // 计算列和
|
||||||
|
// colsum[idx] = 0;
|
||||||
|
// for (int i = 0; i < N; i++) {
|
||||||
|
// colsum[idx] += M[i * N + idx];
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// /*
|
||||||
|
// 这个表格包含多个数组元素,每一组元素(函数名字, COL/ROWCOL, "描述字符串")
|
||||||
|
// COL表示该函数仅仅计算每一列的和
|
||||||
|
// ROWCOL表示该函数计算每一行、每一列的和
|
||||||
|
// 将你认为最好的两个实现,放在最前面。
|
||||||
|
// 比如:
|
||||||
|
// {my_c_sum1, "超级垃圾列求和实现"},
|
||||||
|
// {my_rc_sum2, "好一点的行列求和实现"},
|
||||||
|
// */
|
||||||
|
//
|
||||||
|
// rc_fun_rec rc_fun_tab[] =
|
||||||
|
// {
|
||||||
|
//
|
||||||
|
// /* 第一项,应当是你写的最好列求和的函数实现 */
|
||||||
|
// {cuda_c_sum, COL, "CUDA optimized column sum"},
|
||||||
|
// /* 第二项,应当是你写的最好行列求和的函数实现 */
|
||||||
|
// {cuda_rc_sum, ROWCOL, "CUDA optimized row and column sum"},
|
||||||
|
//
|
||||||
|
// {c_sum, COL, "Column sum, reference implementation"},
|
||||||
|
//
|
||||||
|
// {rc_sum, ROWCOL, "Row and column sum, reference implementation"},
|
||||||
|
//
|
||||||
|
// /* 下面的代码不能修改或者删除!!表明数组列表结束 */
|
||||||
|
// {NULL,ROWCOL,NULL}
|
||||||
|
// };
|
||||||
240
perflab/matrix/rowcol.z~
Normal file
240
perflab/matrix/rowcol.z~
Normal file
@@ -0,0 +1,240 @@
|
|||||||
|
/**************************************************************************
  Row/column sum functions.  Edit this file as follows:
  1. Write your student ID and name in the comment below;
  2. Implement different versions of the row/column sum functions;
  3. Edit the rc_fun_rec rc_fun_tab array so that your best answers
     (best row-and-column sum, best column sum) are its first two entries.
***************************************************************************/
|
||||||
|
|
||||||
|
/*
|
||||||
|
靠靠201209054233
|
||||||
|
靠靠靠靠靠靠靠
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include "rowcol.h"
|
||||||
|
#include <math.h>
|
||||||
|
|
||||||
|
/* 靠靠靠靠靠靠靠靠<E99DA0> */
|
||||||
|
/* 靠靠靠靠靠<E99DA0><E99DA0>靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠蚩靠靠
|
||||||
|
靠靠靠靠靠2靠靠靠靠靠旷靠靠<E99DA0>
|
||||||
|
*/
|
||||||
|
|
||||||
|
void c_sum(matrix_t M, vector_t rowsum, vector_t colsum)
|
||||||
|
{
|
||||||
|
int i,j;
|
||||||
|
for (j = 0; j < N; j++) {
|
||||||
|
colsum[j] = 0;
|
||||||
|
for (i = 0; i < N; i++)
|
||||||
|
colsum[j] += M[i][j];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/* 靠靠靠靠靠靠靠靠靠靠 */
|
||||||
|
/* 靠靠靠靠靠<E99DA0><E99DA0>靠靠<E99DA0>靠靠靠 */
|
||||||
|
|
||||||
|
void rc_sum(matrix_t M, vector_t rowsum, vector_t colsum)
|
||||||
|
{
|
||||||
|
int i,j;
|
||||||
|
for (i = 0; i < N; i++) {
|
||||||
|
rowsum[i] = colsum[i] = 0;
|
||||||
|
for (j = 0; j < N; j++) {
|
||||||
|
rowsum[i] += M[i][j];
|
||||||
|
colsum[i] += M[j][i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
靠靠靠靠靠靠靠靠靠靠靠靠<E99DA0>靠靠靠靠靠靠靠<E99DA0>, COL/ROWCOL, "靠靠靠靠<E99DA0>"靠
|
||||||
|
COL靠靠<E99DA0>靠靠靠靠靠<E99DA0>靠靠<E99DA0>
|
||||||
|
ROWCOL靠靠<E99DA0>靠靠靠<E99DA0>靠靠<E99DA0>靠靠<E99DA0>
|
||||||
|
靠靠靠靠靠蹩靠靠靠靠靠靠靠靠<E99DA0>
|
||||||
|
靠靠
|
||||||
|
{my_c_sum1, "靠靠靠靠靠靠靠靠<E99DA0>"},
|
||||||
|
{my_rc_sum2, "靠靠靠靠靠靠靠靠靠"},
|
||||||
|
*/
|
||||||
|
|
||||||
|
rc_fun_rec rc_fun_tab[] =
|
||||||
|
{
|
||||||
|
|
||||||
|
/* 靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠 */
|
||||||
|
{c_sum, COL, "Best column sum"},
|
||||||
|
/* 靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠 */
|
||||||
|
{rc_sum, ROWCOL, "Best row and column sum"},
|
||||||
|
|
||||||
|
{c_sum, COL, "Column sum, reference implementation"},
|
||||||
|
|
||||||
|
{rc_sum, ROWCOL, "Row and column sum, reference implementation"},
|
||||||
|
|
||||||
|
/* 靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠<E99DA0> */
|
||||||
|
{NULL,ROWCOL,NULL}
|
||||||
|
};
|
||||||
|
|
||||||
|
// /**************************************************************************
|
||||||
|
// 行/列求和函数。按下面的要求编辑此文件:
|
||||||
|
// 1. 将你的学号、姓名,以注释的方式写到下面;
|
||||||
|
// 2. 实现不同版本的行列求和函数;
|
||||||
|
// 3. 编辑rc_fun_rec rc_fun_tab数组,将你的最好的答案
|
||||||
|
// (最好的行和列求和、最好的列求和)作为数组的前两项
|
||||||
|
// ***************************************************************************/
|
||||||
|
//
|
||||||
|
// /*
|
||||||
|
// 学号:202302723005
|
||||||
|
// 姓名:程景愉
|
||||||
|
// */
|
||||||
|
//
|
||||||
|
//
|
||||||
|
// #include <stdio.h>
|
||||||
|
// #include <stdlib.h>
|
||||||
|
// #include "rowcol.h"
|
||||||
|
// #include <math.h>
|
||||||
|
// #include <cuda_runtime.h>
|
||||||
|
//
|
||||||
|
// /* 参考的列求和函数实现 */
|
||||||
|
// /* 计算矩阵中的每一列的和。请注意对于行和列求和来说,调用参数是
|
||||||
|
// 一样的,只是第2个参数不会用到而已
|
||||||
|
// */
|
||||||
|
//
|
||||||
|
// void c_sum(matrix_t M, vector_t rowsum, vector_t colsum)
|
||||||
|
// {
|
||||||
|
// int i,j;
|
||||||
|
// for (j = 0; j < N; j++) {
|
||||||
|
// colsum[j] = 0;
|
||||||
|
// for (i = 0; i < N; i++)
|
||||||
|
// colsum[j] += M[i][j];
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
//
|
||||||
|
// /* 参考的列和行求和函数实现 */
|
||||||
|
// /* 计算矩阵中的每一行、每一列的和。 */
|
||||||
|
//
|
||||||
|
// void rc_sum(matrix_t M, vector_t rowsum, vector_t colsum)
|
||||||
|
// {
|
||||||
|
// int i,j;
|
||||||
|
// for (i = 0; i < N; i++) {
|
||||||
|
// rowsum[i] = colsum[i] = 0;
|
||||||
|
// for (j = 0; j < N; j++) {
|
||||||
|
// rowsum[i] += M[i][j];
|
||||||
|
// colsum[i] += M[j][i];
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// /* CUDA优化的列求和函数 */
|
||||||
|
// void cuda_c_sum(matrix_t M, vector_t rowsum, vector_t colsum)
|
||||||
|
// {
|
||||||
|
// // 分配设备内存
|
||||||
|
// int *d_M, *d_colsum;
|
||||||
|
// cudaMalloc(&d_M, N * N * sizeof(int));
|
||||||
|
// cudaMalloc(&d_colsum, N * sizeof(int));
|
||||||
|
//
|
||||||
|
// // 将数据从主机复制到设备
|
||||||
|
// cudaMemcpy(d_M, M, N * N * sizeof(int), cudaMemcpyHostToDevice);
|
||||||
|
//
|
||||||
|
// // 定义CUDA核函数
|
||||||
|
// dim3 blockDim(256);
|
||||||
|
// dim3 gridDim((N + blockDim.x - 1) / blockDim.x);
|
||||||
|
//
|
||||||
|
// // 启动核函数
|
||||||
|
// cudaColumnSum<<<gridDim, blockDim>>>(d_M, d_colsum);
|
||||||
|
//
|
||||||
|
// // 将结果从设备复制回主机
|
||||||
|
// cudaMemcpy(colsum, d_colsum, N * sizeof(int), cudaMemcpyDeviceToHost);
|
||||||
|
//
|
||||||
|
// // 释放设备内存
|
||||||
|
// cudaFree(d_M);
|
||||||
|
// cudaFree(d_colsum);
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// /* CUDA优化的行列求和函数 */
|
||||||
|
// void cuda_rc_sum(matrix_t M, vector_t rowsum, vector_t colsum)
|
||||||
|
// {
|
||||||
|
// // 分配设备内存
|
||||||
|
// int *d_M, *d_rowsum, *d_colsum;
|
||||||
|
// cudaMalloc(&d_M, N * N * sizeof(int));
|
||||||
|
// cudaMalloc(&d_rowsum, N * sizeof(int));
|
||||||
|
// cudaMalloc(&d_colsum, N * sizeof(int));
|
||||||
|
//
|
||||||
|
// // 将数据从主机复制到设备
|
||||||
|
// cudaMemcpy(d_M, M, N * N * sizeof(int), cudaMemcpyHostToDevice);
|
||||||
|
//
|
||||||
|
// // 定义CUDA核函数
|
||||||
|
// dim3 blockDim(256);
|
||||||
|
// dim3 gridDim((N + blockDim.x - 1) / blockDim.x);
|
||||||
|
//
|
||||||
|
// // 启动核函数
|
||||||
|
// cudaRowColSum<<<gridDim, blockDim>>>(d_M, d_rowsum, d_colsum);
|
||||||
|
//
|
||||||
|
// // 将结果从设备复制回主机
|
||||||
|
// cudaMemcpy(rowsum, d_rowsum, N * sizeof(int), cudaMemcpyDeviceToHost);
|
||||||
|
// cudaMemcpy(colsum, d_colsum, N * sizeof(int), cudaMemcpyDeviceToHost);
|
||||||
|
//
|
||||||
|
// // 释放设备内存
|
||||||
|
// cudaFree(d_M);
|
||||||
|
// cudaFree(d_rowsum);
|
||||||
|
// cudaFree(d_colsum);
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// /* CUDA核函数 - 列求和 */
|
||||||
|
// __global__ void cudaColumnSum(int *M, int *colsum)
|
||||||
|
// {
|
||||||
|
// int col = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
|
// if (col < N) {
|
||||||
|
// colsum[col] = 0;
|
||||||
|
// for (int row = 0; row < N; row++) {
|
||||||
|
// colsum[col] += M[row * N + col];
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// /* CUDA核函数 - 行列求和 */
|
||||||
|
// __global__ void cudaRowColSum(int *M, int *rowsum, int *colsum)
|
||||||
|
// {
|
||||||
|
// int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
|
// if (idx < N) {
|
||||||
|
// // 计算行和
|
||||||
|
// rowsum[idx] = 0;
|
||||||
|
// for (int j = 0; j < N; j++) {
|
||||||
|
// rowsum[idx] += M[idx * N + j];
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// // 计算列和
|
||||||
|
// colsum[idx] = 0;
|
||||||
|
// for (int i = 0; i < N; i++) {
|
||||||
|
// colsum[idx] += M[i * N + idx];
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// /*
|
||||||
|
// 这个表格包含多个数组元素,每一组元素(函数名字, COL/ROWCOL, "描述字符串")
|
||||||
|
// COL表示该函数仅仅计算每一列的和
|
||||||
|
// ROWCOL表示该函数计算每一行、每一列的和
|
||||||
|
// 将你认为最好的两个实现,放在最前面。
|
||||||
|
// 比如:
|
||||||
|
// {my_c_sum1, "超级垃圾列求和实现"},
|
||||||
|
// {my_rc_sum2, "好一点的行列求和实现"},
|
||||||
|
// */
|
||||||
|
//
|
||||||
|
// rc_fun_rec rc_fun_tab[] =
|
||||||
|
// {
|
||||||
|
//
|
||||||
|
// /* 第一项,应当是你写的最好列求和的函数实现 */
|
||||||
|
// {cuda_c_sum, COL, "CUDA optimized column sum"},
|
||||||
|
// /* 第二项,应当是你写的最好行列求和的函数实现 */
|
||||||
|
// {cuda_rc_sum, ROWCOL, "CUDA optimized row and column sum"},
|
||||||
|
//
|
||||||
|
// {c_sum, COL, "Column sum, reference implementation"},
|
||||||
|
//
|
||||||
|
// {rc_sum, ROWCOL, "Row and column sum, reference implementation"},
|
||||||
|
//
|
||||||
|
// /* 下面的代码不能修改或者删除!!表明数组列表结束 */
|
||||||
|
// {NULL,ROWCOL,NULL}
|
||||||
|
// };
|
||||||
69
perflab/matrix/rowcol_202302723005.c
Normal file
69
perflab/matrix/rowcol_202302723005.c
Normal file
@@ -0,0 +1,69 @@
|
|||||||
|
/**************************************************************************
  Row/column sum functions.  Edit this file as follows:
  1. Write your student ID and name in the comment below;
  2. Implement different versions of the row/column sum functions;
  3. Edit the rc_fun_rec rc_fun_tab array so that your best answers
     (best row-and-column sum, best column sum) are its first two entries.
***************************************************************************/
|
||||||
|
|
||||||
|
/*
|
||||||
|
????201209054233
|
||||||
|
??????????????
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "rowcol.h"
|
||||||
|
#include <math.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
|
||||||
|
/* ????????????????? */
|
||||||
|
/* ???????????????????????????????????????????????
|
||||||
|
??????????2?????????????????
|
||||||
|
*/
|
||||||
|
|
||||||
|
void c_sum(matrix_t M, vector_t rowsum, vector_t colsum) {
|
||||||
|
int i, j;
|
||||||
|
for (j = 0; j < N; j++) {
|
||||||
|
colsum[j] = 0;
|
||||||
|
for (i = 0; i < N; i++)
|
||||||
|
colsum[j] += M[i][j];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ???????????????????? */
|
||||||
|
/* ??????????????????????? */
|
||||||
|
|
||||||
|
void rc_sum(matrix_t M, vector_t rowsum, vector_t colsum) {
|
||||||
|
int i, j;
|
||||||
|
for (i = 0; i < N; i++) {
|
||||||
|
rowsum[i] = colsum[i] = 0;
|
||||||
|
for (j = 0; j < N; j++) {
|
||||||
|
rowsum[i] += M[i][j];
|
||||||
|
colsum[i] += M[j][i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
????????????????????????????????????????, COL/ROWCOL, "?????????"??
|
||||||
|
COL??????????????????????
|
||||||
|
ROWCOL???????????????????????
|
||||||
|
?????????????????????????????
|
||||||
|
????
|
||||||
|
{my_c_sum1, "?????????????????"},
|
||||||
|
{my_rc_sum2, "??????????????????"},
|
||||||
|
*/
|
||||||
|
|
||||||
|
rc_fun_rec rc_fun_tab[] = {
|
||||||
|
|
||||||
|
/* ???????????????????????????????? */
|
||||||
|
{c_sum, COL, "Best column sum"},
|
||||||
|
/* ?????????????????????????????????? */
|
||||||
|
{rc_sum, ROWCOL, "Best row and column sum"},
|
||||||
|
|
||||||
|
{c_sum, COL, "Column sum, reference implementation"},
|
||||||
|
|
||||||
|
{rc_sum, ROWCOL, "Row and column sum, reference implementation"},
|
||||||
|
|
||||||
|
/* ??????????????????????????????????????? */
|
||||||
|
{NULL, ROWCOL, NULL}};
|
||||||
BIN
perflab/matrix/rowcol_202302723005.o
Normal file
BIN
perflab/matrix/rowcol_202302723005.o
Normal file
Binary file not shown.
@@ -1,9 +1,9 @@
|
|||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
//#include <random.h>
|
// #include <random.h>
|
||||||
#include "rowcol.h"
|
|
||||||
#include "fcyc.h"
|
|
||||||
#include "clock.h"
|
#include "clock.h"
|
||||||
|
#include "fcyc.h"
|
||||||
|
#include "rowcol.h"
|
||||||
|
|
||||||
#define MAX_ITER_COUNT 100
|
#define MAX_ITER_COUNT 100
|
||||||
|
|
||||||
@@ -11,9 +11,9 @@
|
|||||||
/* Scoring baselines passed to compute_score: entry 0 is used for the
   column sum, entry 1 for the row & column sum. */
static struct {
  double cref;  /* Cycles taken by reference solution */
  double cbest; /* Cycles taken by our best implementation */
} cstandard[2] = {
    [0] = {.cref = 7.7, .cbest = 6.40},  /* Column Sum */
    [1] = {.cref = 9.75, .cbest = 6.60}, /* Row & Column Sum */
};
|
||||||
|
|
||||||
/* Put in code to align matrix so that it starts on a cache block boundary.
|
/* Put in code to align matrix so that it starts on a cache block boundary.
|
||||||
@@ -26,7 +26,7 @@ static struct {
|
|||||||
#define WPB 16
|
#define WPB 16
|
||||||
|
|
||||||
int verbose = 1;
|
int verbose = 1;
|
||||||
int data[N*N+WPB];
|
int data[N * N + WPB];
|
||||||
int *mstart;
|
int *mstart;
|
||||||
|
|
||||||
typedef vector_t *row_t;
|
typedef vector_t *row_t;
|
||||||
@@ -37,137 +37,122 @@ vector_t rsref, csref, rcomp, ccomp;
|
|||||||
static void init_tests(void);
|
static void init_tests(void);
|
||||||
extern void make_CPU_busy(void);
|
extern void make_CPU_busy(void);
|
||||||
|
|
||||||
static void init_tests(void)
|
static void init_tests(void) {
|
||||||
{
|
int i, j;
|
||||||
int i, j;
|
size_t bytes_per_block = sizeof(int) * WPB;
|
||||||
size_t bytes_per_block = sizeof(int) * WPB;
|
/* round mstart up to nearest block boundary */
|
||||||
/* round mstart up to nearest block boundary */
|
mstart = (int *)(((size_t)data + bytes_per_block - 1) / bytes_per_block *
|
||||||
mstart = (int *)
|
bytes_per_block);
|
||||||
(((size_t) data + bytes_per_block-1) / bytes_per_block * bytes_per_block);
|
for (i = 0; i < N; i++) {
|
||||||
for (i = 0; i < N; i++) {
|
rsref[i] = csref[i] = 0;
|
||||||
rsref[i] = csref[i] = 0;
|
}
|
||||||
}
|
for (i = 0; i < N; i++) {
|
||||||
for (i = 0; i < N; i++) {
|
for (j = 0; j < N; j++) {
|
||||||
for (j = 0; j < N; j++) {
|
int val = rand();
|
||||||
int val = rand();
|
mstart[i * N + j] = val;
|
||||||
mstart[i*N+j] = val;
|
rsref[i] += val;
|
||||||
rsref[i] += val;
|
csref[j] += val;
|
||||||
csref[j] += val;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/* Test function on all values */
|
/* Test function on all values */
|
||||||
int test_rc(rc_fun f, FILE *rpt, rc_comp_t rc_type) {
|
int test_rc(rc_fun f, FILE *rpt, rc_comp_t rc_type) {
|
||||||
int i;
|
int i;
|
||||||
int ok = 1;
|
int ok = 1;
|
||||||
|
|
||||||
for (i = 0; i < N; i++)
|
for (i = 0; i < N; i++)
|
||||||
rcomp[i] = ccomp[i] = 0xDEADBEEF;
|
rcomp[i] = ccomp[i] = 0xDEADBEEF;
|
||||||
f((row_t)mstart, rcomp, ccomp);
|
f((row_t)mstart, rcomp, ccomp);
|
||||||
|
|
||||||
for (i = 0; ok && i < N; i++) {
|
|
||||||
if (rc_type == ROWCOL
|
|
||||||
&& rsref[i] != rcomp[i]) {
|
|
||||||
ok = 0;
|
|
||||||
if (rpt)
|
|
||||||
fprintf(rpt,
|
|
||||||
"对第%d行的计算出错!正确结果是%d,但是计算得到%d\n",
|
|
||||||
i, rsref[i], rcomp[i]);
|
|
||||||
}
|
|
||||||
if ((rc_type == ROWCOL || rc_type == COL)
|
|
||||||
&& csref[i] != ccomp[i]) {
|
|
||||||
ok = 0;
|
|
||||||
if (rpt)
|
|
||||||
fprintf(rpt,
|
|
||||||
"对第%d列的计算出错!正确结果是%d,但是计算得到%d\n",
|
|
||||||
i, csref[i], ccomp[i]);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
for (i = 0; ok && i < N; i++) {
|
||||||
|
if (rc_type == ROWCOL && rsref[i] != rcomp[i]) {
|
||||||
|
ok = 0;
|
||||||
|
if (rpt)
|
||||||
|
fprintf(rpt, "对第%d行的计算出错!正确结果是%d,但是计算得到%d\n", i,
|
||||||
|
rsref[i], rcomp[i]);
|
||||||
}
|
}
|
||||||
return ok;
|
if ((rc_type == ROWCOL || rc_type == COL) && csref[i] != ccomp[i]) {
|
||||||
|
ok = 0;
|
||||||
|
if (rpt)
|
||||||
|
fprintf(rpt, "对第%d列的计算出错!正确结果是%d,但是计算得到%d\n", i,
|
||||||
|
csref[i], ccomp[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return ok;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Kludgy way to interface to cycle measuring code */
|
/* Kludgy way to interface to cycle measuring code */
|
||||||
void do_test(int *intf)
|
void do_test(int *intf) {
|
||||||
{
|
rc_fun f = (rc_fun)intf;
|
||||||
rc_fun f = (rc_fun) intf;
|
|
||||||
f((row_t)mstart, rcomp, ccomp);
|
f((row_t)mstart, rcomp, ccomp);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Verify f with test_rc and, if it passes, time it.
   fcyc is called MAX_ITER_COUNT times on do_test and the results are
   averaged; the average divided by N*N (cycles per element) is stored in
   *cycp when cycp is non-NULL.  If f fails verification nothing is timed
   and *cycp is left untouched.  Progress is printed when `verbose` is set. */
void time_rc(rc_fun f, rc_comp_t rc_type, char *descr, double *cycp) {
  int *as_int_ptr = (int *)f; /* fcyc hands this back to do_test */
  double total = 0;
  double per_element;
  int run;

  if (verbose)
    printf("函数:%s\n", descr);

  if (!test_rc(f, stdout, rc_type))
    return;

  make_CPU_busy();
  for (run = 0; run < MAX_ITER_COUNT; run++)
    total += fcyc((void (*)(long *))do_test, as_int_ptr);
  total = total / MAX_ITER_COUNT;
  per_element = total / (N * N);

  if (verbose)
    printf(" 总周期数 = %.2f, 平均周期/元素 = %.2f\n", total, per_element);
  if (cycp)
    *cycp = per_element;
}
|
||||||
|
|
||||||
/* Compute the grade achieved by a function.
   cmeas is the measured cost, cref the reference cost and cbest the cost of
   the best known implementation.  With speedups smeas = cref/cmeas and
   sbest = cref/cbest:
     - below 10% of the way from 1 to sbest the grade is 0;
     - above 110% of the way the grade is capped at 120;
     - otherwise it is 100 * (fraction of the way + 0.1). */
static double compute_score(double cmeas, double cref, double cbest) {
  const double best_speedup = cref / cbest;
  const double meas_speedup = cref / cmeas;
  const double headroom = best_speedup - 1;

  if (meas_speedup < 0.1 * headroom + 1) {
    return 0;
  }
  if (meas_speedup > 1.1 * headroom + 1) {
    return 120;
  }
  return 100 * ((meas_speedup - 1.0) / (best_speedup - 1.0) + 0.1);
}
|
||||||
|
|
||||||
int main(int argc, char *argv[])
|
int main(int argc, char *argv[]) {
|
||||||
{
|
|
||||||
int i;
|
int i;
|
||||||
double cme;
|
double cme;
|
||||||
double cme_c,cme_rc;
|
double cme_c, cme_rc;
|
||||||
int EnableScore=0;
|
int EnableScore = 0;
|
||||||
|
|
||||||
if (argc == 3)
|
if (argc == 3) {
|
||||||
{
|
EnableScore = 1;
|
||||||
EnableScore = 1;
|
verbose = 0;
|
||||||
verbose = 0;
|
|
||||||
}
|
}
|
||||||
init_tests();
|
init_tests();
|
||||||
set_fcyc_clear_cache(1); /* Set so that clears cache between runs */
|
set_fcyc_clear_cache(1); /* Set so that clears cache between runs */
|
||||||
for (i = 0; rc_fun_tab[i].f != NULL; i++) {
|
for (i = 0; rc_fun_tab[i].f != NULL; i++) {
|
||||||
cme = 100.0;
|
cme = 100.0;
|
||||||
time_rc(rc_fun_tab[i].f,
|
time_rc(rc_fun_tab[i].f, rc_fun_tab[i].rc_type, rc_fun_tab[i].descr, &cme);
|
||||||
rc_fun_tab[i].rc_type, rc_fun_tab[i].descr, &cme);
|
if (i == 0) {
|
||||||
if (i == 0)
|
cme_c = cme;
|
||||||
{
|
if (EnableScore == 0) {
|
||||||
cme_c = cme;
|
printf(" 最高\"列求和\"得分 ======================== %.0f\n",
|
||||||
if (EnableScore==0)
|
compute_score(cme, cstandard[0].cref, cstandard[0].cbest));
|
||||||
{
|
}
|
||||||
printf(" 最高\"列求和\"得分 ======================== %.0f\n",
|
}
|
||||||
compute_score(cme, cstandard[0].cref, cstandard[0].cbest));
|
if (i == 1) {
|
||||||
}
|
cme_rc = cme;
|
||||||
}
|
if (EnableScore == 0) {
|
||||||
if (i == 1)
|
printf(" 最高\"行和列求和\"得分 ====================== %.0f\n",
|
||||||
{
|
compute_score(cme, cstandard[1].cref, cstandard[1].cbest));
|
||||||
cme_rc = cme;
|
}
|
||||||
if (EnableScore==0)
|
}
|
||||||
{
|
|
||||||
printf(" 最高\"行和列求和\"得分 ====================== %.0f\n",
|
|
||||||
compute_score(cme, cstandard[1].cref, cstandard[1].cbest));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (EnableScore)
|
if (EnableScore)
|
||||||
printf("%.2f\t %.0f\t %.2f\t %.0f\t 0\t 0\n",cme_c,compute_score(cme_c, cstandard[0].cref, cstandard[0].cbest),
|
printf("%.2f\t %.0f\t %.2f\t %.0f\t 0\t 0\n", cme_c,
|
||||||
cme_rc,compute_score(cme_rc, cstandard[1].cref, cstandard[1].cbest));
|
compute_score(cme_c, cstandard[0].cref, cstandard[0].cbest), cme_rc,
|
||||||
|
compute_score(cme_rc, cstandard[1].cref, cstandard[1].cbest));
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|||||||
BIN
perflab/matrix/rowcol_test.o
Normal file
BIN
perflab/matrix/rowcol_test.o
Normal file
Binary file not shown.
35
perflab/poly/Makefile
Normal file
35
perflab/poly/Makefile
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
# Toolchain and flags
CC = gcc
NVCC = nvcc
CFLAGS = -Wall -O2 -g
CUDA_FLAGS = -O2 -g
LDFLAGS = -lm -lcudart

# Source files; object names are derived from them
SRCS = poly_test.c clock.c cpe.c fcyc.c lsquare.c
CUDA_SRCS = poly.cu
CUDA_OBJS = $(CUDA_SRCS:.cu=.o)
OBJS = $(SRCS:.c=.o) $(CUDA_OBJS)

# Target executable
TARGET = poly_test

# Default target
all: $(TARGET)

# Link the executable from all objects
$(TARGET): $(OBJS)
	$(CC) $^ -o $@ $(LDFLAGS)

# Compile C sources
%.o: %.c
	$(CC) $(CFLAGS) -c $< -o $@

# Compile CUDA sources
$(CUDA_OBJS): %.o: %.cu
	$(NVCC) $(CUDA_FLAGS) -c $< -o $@

# Remove build products
clean:
	rm -f $(OBJS) $(TARGET)

# Phony targets
.PHONY: all clean
|
||||||
@@ -13,11 +13,11 @@
|
|||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include <intrin.h>
|
#include <x86intrin.h>
|
||||||
//#include <intrinsics.h>
|
// #include <intrinsics.h>
|
||||||
#include <windows.h>
|
|
||||||
#include <time.h>
|
|
||||||
#include "clock.h"
|
#include "clock.h"
|
||||||
|
#include <time.h>
|
||||||
|
#include <windows.h>
|
||||||
|
|
||||||
/* Use x86 cycle counter */
|
/* Use x86 cycle counter */
|
||||||
|
|
||||||
@@ -27,203 +27,195 @@ static unsigned cyc_lo = 0;
|
|||||||
|
|
||||||
/* Set *hi and *lo to the high and low order 32 bits of the cycle counter,
   read with the __rdtsc() intrinsic.
   The value is held in an unsigned 64-bit variable so that the shift that
   extracts the high half is fully defined even when the top bit is set.
   (An earlier variant based on QueryPerformanceCounter, taking HighPart and
   LowPart of the LARGE_INTEGER, was left disabled here.) */
void access_counter(unsigned *hi, unsigned *lo) {
  unsigned long long counter = __rdtsc();

  *hi = (unsigned int)(counter >> 32);
  *lo = (unsigned int)counter;
}
|
||||||
|
|
||||||
|
|
||||||
/* Record the current value of the cycle counter. */
|
/* Record the current value of the cycle counter. */
|
||||||
void start_counter()
|
void start_counter() { access_counter(&cyc_hi, &cyc_lo); }
|
||||||
{
|
|
||||||
access_counter(&cyc_hi, &cyc_lo);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Return the number of cycles since the last call to start_counter. */
|
/* Return the number of cycles since the last call to start_counter. */
|
||||||
double get_counter()
|
double get_counter() {
|
||||||
{
|
unsigned ncyc_hi, ncyc_lo;
|
||||||
unsigned ncyc_hi, ncyc_lo;
|
unsigned hi, lo, borrow;
|
||||||
unsigned hi, lo, borrow;
|
double result;
|
||||||
double result;
|
|
||||||
|
|
||||||
/* Get cycle counter */
|
/* Get cycle counter */
|
||||||
access_counter(&ncyc_hi, &ncyc_lo);
|
access_counter(&ncyc_hi, &ncyc_lo);
|
||||||
|
|
||||||
/* Do double precision subtraction */
|
/* Do double precision subtraction */
|
||||||
lo = ncyc_lo - cyc_lo;
|
lo = ncyc_lo - cyc_lo;
|
||||||
borrow = cyc_lo > ncyc_lo;
|
borrow = cyc_lo > ncyc_lo;
|
||||||
hi = ncyc_hi - cyc_hi - borrow;
|
hi = ncyc_hi - cyc_hi - borrow;
|
||||||
result = (double) hi * (1 << 30) * 4 + lo;
|
result = (double)hi * (1 << 30) * 4 + lo;
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
void make_CPU_busy(void)
|
void make_CPU_busy(void) {
|
||||||
{
|
volatile double old_tick, new_tick;
|
||||||
volatile double old_tick,new_tick;
|
start_counter();
|
||||||
start_counter();
|
old_tick = get_counter();
|
||||||
old_tick = get_counter();
|
new_tick = get_counter();
|
||||||
new_tick = get_counter();
|
while (new_tick - old_tick < 1000000000)
|
||||||
while (new_tick - old_tick < 1000000000)
|
new_tick = get_counter();
|
||||||
new_tick = get_counter();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
//CPU的频率
|
// CPU的频率
|
||||||
double mhz(int verbose)
|
double mhz(int verbose) {
|
||||||
{
|
LARGE_INTEGER lFrequency;
|
||||||
LARGE_INTEGER lFrequency;
|
LARGE_INTEGER lPerformanceCount_Start;
|
||||||
LARGE_INTEGER lPerformanceCount_Start;
|
LARGE_INTEGER lPerformanceCount_End;
|
||||||
LARGE_INTEGER lPerformanceCount_End;
|
double mhz;
|
||||||
double mhz;
|
double fTime;
|
||||||
double fTime;
|
__int64 _i64StartCpuCounter;
|
||||||
__int64 _i64StartCpuCounter;
|
__int64 _i64EndCpuCounter;
|
||||||
__int64 _i64EndCpuCounter;
|
// On a multiprocessor machine, it should not matter which processor is
|
||||||
//On a multiprocessor machine, it should not matter which processor is called.
|
// called. However, you can get different results on different processors due
|
||||||
//However, you can get different results on different processors due to bugs in
|
// to bugs in the BIOS or the HAL. To specify processor affinity for a thread,
|
||||||
//the BIOS or the HAL. To specify processor affinity for a thread, use the SetThreadAffinityMask function.
|
// use the SetThreadAffinityMask function.
|
||||||
HANDLE hThread=GetCurrentThread();
|
HANDLE hThread = GetCurrentThread();
|
||||||
SetThreadAffinityMask(hThread,0x1);
|
SetThreadAffinityMask(hThread, 0x1);
|
||||||
|
|
||||||
//主板上高精度定时器的晶振频率
|
// 主板上高精度定时器的晶振频率
|
||||||
//这个定时器应该就是一片8253或者8254
|
// 这个定时器应该就是一片8253或者8254
|
||||||
//在intel ich7中集成了8254
|
// 在intel ich7中集成了8254
|
||||||
QueryPerformanceFrequency(&lFrequency);
|
QueryPerformanceFrequency(&lFrequency);
|
||||||
// if (verbose>0)
|
// if (verbose>0)
|
||||||
// printf("高精度定时器的晶振频率:%1.0fHz.\n",(double)lFrequency.QuadPart);
|
// printf("<EFBFBD>߾<EFBFBD><EFBFBD>ȶ<EFBFBD>ʱ<EFBFBD><EFBFBD><EFBFBD>ľ<EFBFBD><EFBFBD><EFBFBD>Ƶ<EFBFBD>ʣ<EFBFBD>%1.0fHz.\n",(double)lFrequency.QuadPart);
|
||||||
|
|
||||||
//这个定时器每经过一个时钟周期,其计数器会+1
|
// 这个定时器每经过一个时钟周期,其计数器会+1
|
||||||
QueryPerformanceCounter(&lPerformanceCount_Start);
|
QueryPerformanceCounter(&lPerformanceCount_Start);
|
||||||
|
|
||||||
//RDTSC指令:获取CPU经历的时钟周期数
|
// RDTSC指令:获取CPU经历的时钟周期数
|
||||||
_i64StartCpuCounter=__rdtsc();
|
_i64StartCpuCounter = __rdtsc();
|
||||||
|
|
||||||
//延时长一点,误差会小一点
|
// 延时长一点,误差会小一点
|
||||||
//int nTemp=100000;
|
// int nTemp=100000;
|
||||||
//while (--nTemp);
|
// while (--nTemp);
|
||||||
Sleep(200);
|
Sleep(200);
|
||||||
|
|
||||||
QueryPerformanceCounter(&lPerformanceCount_End);
|
QueryPerformanceCounter(&lPerformanceCount_End);
|
||||||
|
|
||||||
_i64EndCpuCounter=__rdtsc();
|
_i64EndCpuCounter = __rdtsc();
|
||||||
|
|
||||||
//f=1/T => f=计数次数/(计数次数*T)
|
// f=1/T => f=计数次数/(计数次数*T)
|
||||||
//这里的“计数次数*T”就是时间差
|
// 这里的“计数次数*T”就是时间差
|
||||||
fTime=((double)lPerformanceCount_End.QuadPart-(double)lPerformanceCount_Start.QuadPart)
|
fTime = ((double)lPerformanceCount_End.QuadPart -
|
||||||
/(double)lFrequency.QuadPart;
|
(double)lPerformanceCount_Start.QuadPart) /
|
||||||
|
(double)lFrequency.QuadPart;
|
||||||
|
|
||||||
mhz = (_i64EndCpuCounter-_i64StartCpuCounter)/(fTime*1000000.0);
|
mhz = (_i64EndCpuCounter - _i64StartCpuCounter) / (fTime * 1000000.0);
|
||||||
if (verbose>0)
|
if (verbose > 0)
|
||||||
printf("CPU频率为:%1.6fMHz.\n",mhz);
|
printf("CPUƵ<EFBFBD><EFBFBD>Ϊ:%1.6fMHz.\n", mhz);
|
||||||
return mhz;
|
return mhz;
|
||||||
}
|
}
|
||||||
|
|
||||||
double CPU_Factor1(void)
|
double CPU_Factor1(void) {
|
||||||
{
|
double result;
|
||||||
double result;
|
int i, j, k, ii, jj, kk;
|
||||||
int i,j,k,ii,jj,kk;
|
LARGE_INTEGER lStart, lEnd;
|
||||||
LARGE_INTEGER lStart,lEnd;
|
|
||||||
LARGE_INTEGER lFrequency;
|
LARGE_INTEGER lFrequency;
|
||||||
HANDLE hThread;
|
HANDLE hThread;
|
||||||
double fTime;
|
double fTime;
|
||||||
|
|
||||||
QueryPerformanceFrequency(&lFrequency);
|
QueryPerformanceFrequency(&lFrequency);
|
||||||
|
|
||||||
ii = 43273;
|
ii = 43273;
|
||||||
kk = 1238;
|
kk = 1238;
|
||||||
result = 1;
|
result = 1;
|
||||||
jj = 1244;
|
jj = 1244;
|
||||||
|
|
||||||
hThread=GetCurrentThread();
|
hThread = GetCurrentThread();
|
||||||
SetThreadAffinityMask(hThread,0x1);
|
SetThreadAffinityMask(hThread, 0x1);
|
||||||
QueryPerformanceCounter(&lStart);
|
QueryPerformanceCounter(&lStart);
|
||||||
//_asm("cpuid");
|
//_asm("cpuid");
|
||||||
start_counter();
|
start_counter();
|
||||||
for (i=0;i<100;i++)
|
for (i = 0; i < 100; i++)
|
||||||
for (j=0;j<1000;j++)
|
for (j = 0; j < 1000; j++)
|
||||||
for (k=0;k<1000;k++)
|
for (k = 0; k < 1000; k++)
|
||||||
kk += kk*ii+jj;
|
kk += kk * ii + jj;
|
||||||
|
|
||||||
result = get_counter();
|
result = get_counter();
|
||||||
QueryPerformanceCounter(&lEnd);
|
QueryPerformanceCounter(&lEnd);
|
||||||
fTime=((double)lEnd.QuadPart-(double)lStart.QuadPart);
|
fTime = ((double)lEnd.QuadPart - (double)lStart.QuadPart);
|
||||||
printf("CPU运行时间为%f",result);
|
printf("CPU<EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʱ<EFBFBD><EFBFBD>Ϊ%f", result);
|
||||||
printf("\t %f\n",fTime);
|
printf("\t %f\n", fTime);
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
double CPU_Factor(void)
|
double CPU_Factor(void) {
|
||||||
{
|
double frequency;
|
||||||
double frequency;
|
double multiplier = 1000 * 1000 * 1000; // nano
|
||||||
double multiplier = 1000 * 1000 * 1000;//nano
|
LARGE_INTEGER lFrequency;
|
||||||
LARGE_INTEGER lFrequency;
|
LARGE_INTEGER start, stop;
|
||||||
LARGE_INTEGER start,stop;
|
HANDLE hThread;
|
||||||
HANDLE hThread;
|
int i;
|
||||||
int i;
|
const int gigahertz = 1000 * 1000 * 1000;
|
||||||
const int gigahertz= 1000*1000*1000;
|
const int known_instructions_per_loop = 27317;
|
||||||
const int known_instructions_per_loop = 27317;
|
|
||||||
|
|
||||||
int iterations = 100000000;
|
int iterations = 100000000;
|
||||||
int g = 0;
|
int g = 0;
|
||||||
double normal_ticks_per_second;
|
double normal_ticks_per_second;
|
||||||
double ticks;
|
double ticks;
|
||||||
double time;
|
double time;
|
||||||
double loops_per_sec;
|
double loops_per_sec;
|
||||||
double instructions_per_loop;
|
double instructions_per_loop;
|
||||||
double ratio;
|
double ratio;
|
||||||
double actual_freq;
|
double actual_freq;
|
||||||
|
|
||||||
QueryPerformanceFrequency(&lFrequency);
|
QueryPerformanceFrequency(&lFrequency);
|
||||||
frequency = (double)lFrequency.QuadPart;
|
frequency = (double)lFrequency.QuadPart;
|
||||||
|
|
||||||
hThread=GetCurrentThread();
|
hThread = GetCurrentThread();
|
||||||
SetThreadAffinityMask(hThread,0x1);
|
SetThreadAffinityMask(hThread, 0x1);
|
||||||
QueryPerformanceCounter(&start);
|
QueryPerformanceCounter(&start);
|
||||||
for( i = 0; i < iterations; i++)
|
for (i = 0; i < iterations; i++) {
|
||||||
{
|
g++;
|
||||||
g++;
|
g++;
|
||||||
g++;
|
g++;
|
||||||
g++;
|
g++;
|
||||||
g++;
|
}
|
||||||
}
|
QueryPerformanceCounter(&stop);
|
||||||
QueryPerformanceCounter(&stop);
|
|
||||||
|
|
||||||
//normal ticks differs from the WMI data, i.e 3125, when WMI 3201, and CPUZ 3199
|
// normal ticks differs from the WMI data, i.e 3125, when WMI 3201, and CPUZ
|
||||||
normal_ticks_per_second = frequency * 1000;
|
// 3199
|
||||||
ticks = (double)((double)stop.QuadPart - (double)start.QuadPart);
|
normal_ticks_per_second = frequency * 1000;
|
||||||
time = (ticks * multiplier) /frequency;
|
ticks = (double)((double)stop.QuadPart - (double)start.QuadPart);
|
||||||
loops_per_sec = iterations / (time/multiplier);
|
time = (ticks * multiplier) / frequency;
|
||||||
instructions_per_loop = normal_ticks_per_second / loops_per_sec;
|
loops_per_sec = iterations / (time / multiplier);
|
||||||
|
instructions_per_loop = normal_ticks_per_second / loops_per_sec;
|
||||||
|
|
||||||
ratio = (instructions_per_loop / known_instructions_per_loop);
|
ratio = (instructions_per_loop / known_instructions_per_loop);
|
||||||
actual_freq = normal_ticks_per_second / ratio;
|
actual_freq = normal_ticks_per_second / ratio;
|
||||||
/*
|
/*
|
||||||
actual_freq = normal_ticks_per_second / ratio;
|
actual_freq = normal_ticks_per_second / ratio;
|
||||||
actual_freq = known_instructions_per_loop*iterations*multiplier/time;
|
actual_freq = known_instructions_per_loop*iterations*multiplier/time;
|
||||||
|
|
||||||
2293 = x/time;
|
2293 = x/time;
|
||||||
|
|
||||||
2292.599713*1191533038.809362=known_instructions_per_loop*100000000*1000
|
2292.599713*1191533038.809362=known_instructions_per_loop*100000000*1000
|
||||||
loops_per_sec = iterations*frequency / ticks
|
loops_per_sec = iterations*frequency / ticks
|
||||||
|
|
||||||
instructions_per_loop = / loops_per_sec;
|
instructions_per_loop = / loops_per_sec;
|
||||||
*/
|
*/
|
||||||
printf("Perf counter freq: %f\n", normal_ticks_per_second);
|
printf("Perf counter freq: %f\n", normal_ticks_per_second);
|
||||||
printf("Loops per sec: %f\n", loops_per_sec);
|
printf("Loops per sec: %f\n", loops_per_sec);
|
||||||
printf("Perf counter freq div loops per sec: %f\n", instructions_per_loop);
|
printf("Perf counter freq div loops per sec: %f\n", instructions_per_loop);
|
||||||
printf("Presumed freq: %f\n", actual_freq);
|
printf("Presumed freq: %f\n", actual_freq);
|
||||||
printf("ratio: %f\n", ratio);
|
printf("ratio: %f\n", ratio);
|
||||||
printf("time=%f\n",time);
|
printf("time=%f\n", time);
|
||||||
return ratio;
|
return ratio;
|
||||||
}
|
}
|
||||||
|
|||||||
325
perflab/poly/poly.cu
Normal file
325
perflab/poly/poly.cu
Normal file
@@ -0,0 +1,325 @@
|
|||||||
|
/**************************************************************************
|
||||||
|
多项式计算函数。按下面的要求编辑此文件:
|
||||||
|
1. 将你的学号、姓名,以注释的方式写到下面;
|
||||||
|
2. 实现不同版本的多项式计算函数;
|
||||||
|
3. 编辑peval_fun_rec peval_fun_tab数组,将你的最好的答案
|
||||||
|
(最小CPE、最小C10)作为数组的前两项
|
||||||
|
***************************************************************************/
|
||||||
|
|
||||||
|
/*
|
||||||
|
学号:201209054233
|
||||||
|
姓名:夜半加班狂
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <cuda_runtime.h>
|
||||||
|
/* Signature shared by every polynomial evaluator in this file:
 * (coefficient array, degree, evaluation point x) -> polynomial value. */
typedef int (*peval_fun)(int*, int, int);
|
||||||
|
|
||||||
|
/* One row of the implementation table: an evaluator and its description. */
typedef struct {
|
||||||
|
peval_fun f;      /* evaluator function; NULL in the table's terminating row */
|
||||||
|
char *descr;      /* human-readable description of the evaluator */
|
||||||
|
} peval_fun_rec, *peval_fun_ptr;
|
||||||
|
|
||||||
|
|
||||||
|
/**************************************************************************
|
||||||
|
Edit this comment to indicate your name and Andrew ID
|
||||||
|
#ifdef ASSIGN
|
||||||
|
Submission by Harry Q. Bovik, bovik@andrew.cmu.edu
|
||||||
|
#else
|
||||||
|
Instructor's version.
|
||||||
|
Created by Randal E. Bryant, Randy.Bryant@cs.cmu.edu, 10/07/02
|
||||||
|
#endif
|
||||||
|
***************************************************************************/
|
||||||
|
|
||||||
|
/*
|
||||||
|
实现一个指定的常系数多项式计算
|
||||||
|
第一次,请直接运行程序,以便获知你需要实现的常系数是啥
|
||||||
|
*/
|
||||||
|
/*
 * Evaluate the fixed cubic 21 + 90*x + 42*x^2 + 88*x^3.
 *
 * not_use, not_use2: ignored; present only so the function has the same
 *                    signature as the other polynomial evaluators.
 * x:                 point at which the polynomial is evaluated.
 *
 * Returns the polynomial value, wrapped to the width of int.
 *
 * The multiplications by the constant coefficients are strength-reduced to
 * shifts and adds:
 *     90 = 64 + 32 - 4 - 2
 *     42 = 32 + 8 + 2
 *     88 = 64 + 16 + 8
 * All arithmetic is carried out on unsigned values so that shifting a
 * negative x and wrapping past INT_MAX are well defined instead of undefined
 * behaviour; on two's-complement targets the result is bit-identical to the
 * signed computation.
 */
int const_poly_eval(int *not_use, int not_use2, int x)
{
    unsigned ux = (unsigned)x;
    unsigned x64 = ux << 6;
    unsigned x32 = ux << 5;
    unsigned x16 = ux << 4;
    unsigned x8 = ux << 3;
    unsigned x4 = ux << 2;
    unsigned x2 = ux << 1;

    unsigned t90 = x64 + x32 - x4 - x2; /* 90 * x */
    unsigned t42 = x32 + x8 + x2;       /* 42 * x */
    unsigned t88 = x64 + x16 + x8;      /* 88 * x */

    (void)not_use;
    (void)not_use2;

    /* 21 + 90x + (42x + 88x*x)*x  ==  21 + 90x + 42x^2 + 88x^3 */
    return (int)(21u + t90 + (t42 + t88 * ux) * ux);
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/* 多项式计算函数。注意:这个只是一个参考实现,你需要实现自己的版本 */
|
||||||
|
|
||||||
|
/*
|
||||||
|
友情提示:lcc支持ATT格式的嵌入式汇编,例如
|
||||||
|
|
||||||
|
_asm("movl %eax,%ebx");
|
||||||
|
_asm("pushl %edx");
|
||||||
|
|
||||||
|
可以在lcc中project->configuration->Compiler->Code Generation->Generate .asm,
|
||||||
|
将其选中后,可以在lcc目录下面生成对应程序的汇编代码实现。通过查看汇编文件,
|
||||||
|
你可以了解编译器是如何实现你的代码的。有些实现可能非常低效。
|
||||||
|
你可以在适当的地方加入嵌入式汇编,来大幅度提高计算性能。
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
 * Reference polynomial evaluation:
 *     a[0] + a[1]*x + a[2]*x^2 + ... + a[degree]*x^degree
 *
 * a      : array of degree+1 coefficients, lowest power first.
 * degree : highest power of x; a negative degree evaluates no terms.
 * x      : evaluation point.
 *
 * Returns the polynomial value wrapped to the width of int (0 when no terms
 * are evaluated).
 *
 * The running power of x and the sum are kept in unsigned variables: x^i
 * exceeds INT_MAX after only a few dozen terms for any |x| >= 2, and signed
 * overflow is undefined behaviour, whereas unsigned arithmetic wraps.  On
 * two's-complement targets the returned bits match the signed computation.
 */
int poly_eval(int *a, int degree, int x)
{
    unsigned sum = 0;
    unsigned xpwr = 1; /* x^i for the current term */
    int i;

    for (i = 0; i <= degree; i++) {
        sum += (unsigned)a[i] * xpwr;
        xpwr *= (unsigned)x;
    }
    return (int)sum;
}
|
||||||
|
|
||||||
|
/* CUDA优化的多项式计算函数 - 低CPE版本 */
|
||||||
|
/* Forward declaration of the kernel launched below.  Its definition appears
 * later in this file, after this first use, so without a declaration here
 * the launch does not compile. */
__global__ void cudaPolyEvalLowCPE(int *a, int degree, int x, int *results);

/*
 * Evaluate a[0] + a[1]*x + ... + a[degree]*x^degree with the help of the GPU.
 *
 * The coefficients are copied to the device, cudaPolyEvalLowCPE is launched
 * with at least degree+1 threads to fill a device array with one value per
 * term, and that array is copied back and summed on the host.
 *
 * a      : host array of degree+1 coefficients, lowest power first.
 * degree : highest power of x; a negative degree yields 0 (empty sum).
 * x      : evaluation point.
 *
 * Returns the sum of the per-term values (accumulated with unsigned
 * wrap-around so overflow is well defined), or 0 after printing a message to
 * stdout when a CUDA call or the host allocation fails.  Every exit path
 * releases whatever had been allocated up to that point.
 */
int cuda_poly_eval_low_cpe(int *a, int degree, int x)
{
    const unsigned threads = 256;
    int *d_a = NULL;       /* device copy of the coefficients */
    int *d_results = NULL; /* device array of per-term values */
    int *h_results = NULL; /* host copy of d_results          */
    unsigned sum = 0;
    unsigned blocks;
    size_t nterms;
    size_t nbytes;
    size_t i;
    cudaError_t err;

    if (degree < 0)
        return 0;

    nterms = (size_t)degree + 1;
    nbytes = nterms * sizeof(int);
    blocks = (unsigned)((nterms + threads - 1) / threads);

    err = cudaMalloc((void **)&d_a, nbytes);
    if (err != cudaSuccess)
        goto cuda_fail;

    err = cudaMalloc((void **)&d_results, nbytes);
    if (err != cudaSuccess)
        goto cuda_fail;

    err = cudaMemcpy(d_a, a, nbytes, cudaMemcpyHostToDevice);
    if (err != cudaSuccess)
        goto cuda_fail;

    cudaPolyEvalLowCPE<<<blocks, threads>>>(d_a, degree, x, d_results);

    /* Launch-time errors are reported through cudaGetLastError(). */
    err = cudaGetLastError();
    if (err != cudaSuccess)
        goto cuda_fail;

    h_results = (int *)malloc(nbytes);
    if (h_results == NULL) {
        printf("Memory allocation error\n");
        goto done;
    }

    err = cudaMemcpy(h_results, d_results, nbytes, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess)
        goto cuda_fail;

    for (i = 0; i < nterms; i++)
        sum += (unsigned)h_results[i];
    goto done;

cuda_fail:
    printf("CUDA Error: %s\n", cudaGetErrorString(err));
    sum = 0;

done:
    /* free(NULL) and cudaFree(NULL) are no-ops, so one exit serves all paths. */
    free(h_results);
    cudaFree(d_a);
    cudaFree(d_results);
    return (int)sum;
}
|
||||||
|
|
||||||
|
/* CUDA优化的多项式计算函数 - 10阶优化版本 */
|
||||||
|
/* Forward declaration of the kernel launched below.  Its definition appears
 * later in this file, after this first use, so without a declaration here
 * the launch does not compile. */
__global__ void cudaPolyEvalDegree10(int *a, int degree, int x, int *result);

/*
 * Evaluate a[0] + a[1]*x + ... + a[degree]*x^degree, tuned for small degrees
 * such as 10: the coefficients are copied to the device, a single block of
 * cudaPolyEvalDegree10 reduces all terms into one int, and that int is
 * copied back.
 *
 * a      : host array of degree+1 coefficients, lowest power first.
 * degree : highest power of x; a negative degree yields 0 (empty sum).
 * x      : evaluation point.
 *
 * Returns the polynomial value, or 0 after printing a message to stdout when
 * a CUDA call fails.
 *
 * The block is launched with exactly degree+1 threads (one per term) instead
 * of a fixed 256, so no thread is idle.  Degrees of MAX_THREADS or more are
 * evaluated on the host: a single block with one thread per term cannot
 * cover them, and launching anyway would silently drop the high-order terms.
 */
int cuda_poly_eval_degree10(int *a, int degree, int x)
{
    enum { MAX_THREADS = 256 };
    int *d_a = NULL;      /* device copy of the coefficients */
    int *d_result = NULL; /* device cell receiving the sum   */
    int result = 0;
    size_t nbytes;
    cudaError_t err;

    if (degree < 0)
        return 0;

    if (degree >= MAX_THREADS) {
        /* Host fallback; unsigned arithmetic wraps instead of overflowing. */
        unsigned sum = 0;
        unsigned xpwr = 1;
        int i;

        for (i = 0; i <= degree; i++) {
            sum += (unsigned)a[i] * xpwr;
            xpwr *= (unsigned)x;
        }
        return (int)sum;
    }

    nbytes = ((size_t)degree + 1) * sizeof(int);

    err = cudaMalloc((void **)&d_a, nbytes);
    if (err != cudaSuccess)
        goto cuda_fail;

    err = cudaMalloc((void **)&d_result, sizeof(int));
    if (err != cudaSuccess)
        goto cuda_fail;

    err = cudaMemcpy(d_a, a, nbytes, cudaMemcpyHostToDevice);
    if (err != cudaSuccess)
        goto cuda_fail;

    /* One block is enough because only a single result is produced. */
    cudaPolyEvalDegree10<<<1, degree + 1>>>(d_a, degree, x, d_result);

    /* Launch-time errors are reported through cudaGetLastError(). */
    err = cudaGetLastError();
    if (err != cudaSuccess)
        goto cuda_fail;

    err = cudaMemcpy(&result, d_result, sizeof(int), cudaMemcpyDeviceToHost);
    if (err != cudaSuccess)
        goto cuda_fail;
    goto done;

cuda_fail:
    printf("CUDA Error: %s\n", cudaGetErrorString(err));
    result = 0;

done:
    /* cudaFree(NULL) is a no-op, so one exit serves all paths. */
    cudaFree(d_a);
    cudaFree(d_result);
    return result;
}
|
||||||
|
|
||||||
|
/* CUDA核函数 - 低CPE版本 */
|
||||||
|
/*
 * Kernel: the thread with global index idx writes the single term
 * a[idx] * x^idx to results[idx] for 0 <= idx <= degree.  Threads whose
 * index exceeds degree do nothing.
 *
 * a       : device array of degree+1 coefficients.
 * degree  : highest power of x.
 * x       : evaluation point.
 * results : device array of degree+1 ints receiving one term each.
 *
 * x^idx is built by square-and-multiply, i.e. O(log idx) multiplications per
 * thread rather than idx of them.  The arithmetic is unsigned so that large
 * powers wrap instead of triggering signed overflow; the stored bits equal
 * those of the repeated-multiplication form modulo 2^32.
 */
__global__ void cudaPolyEvalLowCPE(int *a, int degree, int x, int *results)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned base;
    unsigned xpwr;
    int e;

    if (idx > degree)
        return;

    base = (unsigned)x;
    xpwr = 1u;
    for (e = idx; e > 0; e >>= 1) {
        if (e & 1)
            xpwr *= base;
        base *= base;
    }

    results[idx] = (int)((unsigned)a[idx] * xpwr);
}
|
||||||
|
|
||||||
|
/* CUDA核函数 - 10阶优化版本 */
|
||||||
|
/*
 * Kernel: sums a[i] * x^i for 0 <= i <= degree within one thread block and
 * stores the total in *result.
 *
 * a      : device array of degree+1 coefficients.
 * degree : highest power of x.
 * x      : evaluation point.
 * result : device int receiving the sum.
 *
 * Each thread handles terms threadIdx.x, threadIdx.x + blockDim.x, ... so
 * every term is covered even when degree + 1 exceeds the block size.  Each
 * thread accumulates its terms privately; threads with a non-zero partial
 * sum add it to a shared accumulator with atomicAdd, and thread 0 publishes
 * the total after a barrier.  Every block computes the full sum on its own,
 * so a launch with more than one block just stores the same value again.
 *
 * Powers are built by square-and-multiply on unsigned values, so they wrap
 * instead of triggering signed overflow.
 */
__global__ void cudaPolyEvalDegree10(int *a, int degree, int x, int *result)
{
    __shared__ int shared_result;
    unsigned local = 0;
    int term;

    /* Thread 0 zeroes the accumulator before anyone adds to it. */
    if (threadIdx.x == 0)
        shared_result = 0;
    __syncthreads();

    for (term = threadIdx.x; term <= degree; term += blockDim.x) {
        unsigned base = (unsigned)x;
        unsigned xpwr = 1u;
        int e;

        for (e = term; e > 0; e >>= 1) {
            if (e & 1)
                xpwr *= base;
            base *= base;
        }
        local += (unsigned)a[term] * xpwr;
    }

    /* Adding zero would not change the total, so skip the atomic. */
    if (local != 0)
        atomicAdd(&shared_result, (int)local);

    /* All contributions must have landed before the total is read. */
    __syncthreads();

    if (threadIdx.x == 0)
        *result = shared_result;
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
这个表格包含多个数组元素,每一组元素(函数名字, "描述字符串")
|
||||||
|
将你认为最好的两个实现,放在最前面。
|
||||||
|
比如:
|
||||||
|
{my_poly_eval1, "超级垃圾实现"},
|
||||||
|
{my_poly_eval2, "好一点的实现"},
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* Table of polynomial evaluators paired with a description string; the list
 * ends at the row whose function pointer is NULL. */
peval_fun_rec peval_fun_tab[] =
|
||||||
|
{
|
||||||
|
|
||||||
|
/* First entry: should be the implementation you wrote with the best CPE */
|
||||||
|
{cuda_poly_eval_low_cpe, "CUDA optimized low CPE implementation"},
|
||||||
|
/* Second entry: should be the implementation you wrote with the best performance at degree 10 */
|
||||||
|
{cuda_poly_eval_degree10, "CUDA optimized degree 10 implementation"},
|
||||||
|
|
||||||
|
{poly_eval, "poly_eval: 参考实现"},
|
||||||
|
|
||||||
|
/* Do not modify or delete the entry below!! It marks the end of the array */
|
||||||
|
{NULL, ""}
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
BIN
perflab/poly/poly.o
Normal file
BIN
perflab/poly/poly.o
Normal file
Binary file not shown.
@@ -6,6 +6,7 @@
|
|||||||
#include "poly.h"
|
#include "poly.h"
|
||||||
#include "cpe.h"
|
#include "cpe.h"
|
||||||
#include "clock.h"
|
#include "clock.h"
|
||||||
|
#include <time.h>
|
||||||
|
|
||||||
double CPU_Mhz;
|
double CPU_Mhz;
|
||||||
|
|
||||||
@@ -17,7 +18,7 @@ static int coeff[MAXDEGREE+1];
|
|||||||
|
|
||||||
#define MAX_ITER_COUNT 100
|
#define MAX_ITER_COUNT 100
|
||||||
|
|
||||||
#define REF_CPU_MHZ 2292.6 // 这是我的处理器主频
|
#define REF_CPU_MHZ 2292.6 // <EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ҵĴ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ƶ
|
||||||
|
|
||||||
/* Define performance standards */
|
/* Define performance standards */
|
||||||
static struct {
|
static struct {
|
||||||
@@ -26,7 +27,7 @@ static struct {
|
|||||||
} cstandard[3] =
|
} cstandard[3] =
|
||||||
{{4.00, 1.75}, /* CPE */
|
{{4.00, 1.75}, /* CPE */
|
||||||
{50, 43}, /* C(10) */
|
{50, 43}, /* C(10) */
|
||||||
{57,31} /* 常系数多项式计算 */
|
{57,31} /* <EFBFBD><EFBFBD>ϵ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʽ<EFBFBD><EFBFBD><EFBFBD><EFBFBD> */
|
||||||
};
|
};
|
||||||
|
|
||||||
int coeff_const[4];
|
int coeff_const[4];
|
||||||
@@ -82,7 +83,7 @@ static void init_const_poly(void)
|
|||||||
coeff_const[i] = rand_div+10;
|
coeff_const[i] = rand_div+10;
|
||||||
}
|
}
|
||||||
|
|
||||||
printf("你需要修改poly.c的const_poly_eval函数,实现下面的常数多项式计算!\n");
|
printf("<EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ҫ<EFBFBD><EFBFBD>poly.c<EFBFBD><EFBFBD>const_poly_eval<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʵ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ij<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʽ<EFBFBD><EFBFBD><EFBFBD>㣡\n");
|
||||||
printf("\tresult=%d+%d*x+%d*x^2+%d*x^3\n",coeff_const[0],coeff_const[1],coeff_const[2],coeff_const[3]);
|
printf("\tresult=%d+%d*x+%d*x^2+%d*x^3\n",coeff_const[0],coeff_const[1],coeff_const[2],coeff_const[3]);
|
||||||
|
|
||||||
fixval_const = ref_poly_eval(coeff_const, 3, xval);
|
fixval_const = ref_poly_eval(coeff_const, 3, xval);
|
||||||
@@ -97,15 +98,15 @@ void test_const_poly(void)
|
|||||||
int my_cal = const_poly_eval(coeff_const, 3, xval);
|
int my_cal = const_poly_eval(coeff_const, 3, xval);
|
||||||
if (fixval_const != my_cal)
|
if (fixval_const != my_cal)
|
||||||
{
|
{
|
||||||
printf("常系数多项式计算const_poly_eval实现错误(x=%d),预期结果是%d,但是计算得到的是%d\n",xval,fixval_const,my_cal);
|
printf("<EFBFBD><EFBFBD>ϵ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʽ<EFBFBD><EFBFBD><EFBFBD><EFBFBD>const_poly_evalʵ<EFBFBD>ִ<EFBFBD><EFBFBD><EFBFBD>x=%d<><64><EFBFBD><EFBFBD>Ԥ<EFBFBD>ڽ<EFBFBD><DABD><EFBFBD><EFBFBD>%d<><64><EFBFBD><EFBFBD><EFBFBD>Ǽ<EFBFBD><C7BC><EFBFBD>õ<EFBFBD><C3B5><EFBFBD><EFBFBD><EFBFBD>%d\n",xval,fixval_const,my_cal);
|
||||||
exit(0);
|
exit(0);
|
||||||
}
|
}
|
||||||
fix_time = 0;
|
fix_time = 0;
|
||||||
for (i=0;i<MAX_ITER_COUNT;i++)
|
for (i=0;i<MAX_ITER_COUNT;i++)
|
||||||
fix_time += measure_function(run_fun_const, 3);
|
fix_time += measure_function(run_fun_const, 3);
|
||||||
fix_time = fix_time / MAX_ITER_COUNT;
|
fix_time = fix_time / MAX_ITER_COUNT;
|
||||||
printf(" 常系数多项式计算时间 = %.1f\n", fix_time);
|
printf(" <EFBFBD><EFBFBD>ϵ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʽ<EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʱ<EFBFBD><EFBFBD> = %.1f\n", fix_time);
|
||||||
printf(" 最高的常系数多项式计算得分 ============== %.0f\n",
|
printf(" <EFBFBD><EFBFBD>ߵij<EFBFBD>ϵ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʽ<EFBFBD><EFBFBD><EFBFBD><EFBFBD>÷<EFBFBD> ============== %.0f\n",
|
||||||
compute_score(fix_time, cstandard[2].cref, cstandard[2].cbest));
|
compute_score(fix_time, cstandard[2].cref, cstandard[2].cbest));
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -132,7 +133,7 @@ int test_poly(peval_fun f, FILE *rpt) {
|
|||||||
ok = 0;
|
ok = 0;
|
||||||
if (rpt) {
|
if (rpt) {
|
||||||
fprintf(rpt,
|
fprintf(rpt,
|
||||||
"错误!多项式计算不对!阶=%d时,计算的值是%d,而正确值是%d\n",
|
"<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʽ<EFBFBD><EFBFBD><EFBFBD>㲻<EFBFBD>ԣ<EFBFBD><EFBFBD><EFBFBD>=%dʱ<64><CAB1><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ֵ<EFBFBD><D6B5>%d<><64><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ȷֵ<C8B7><D6B5>%d\n",
|
||||||
MAXDEGREE-i, v, pval[i]);
|
MAXDEGREE-i, v, pval[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -142,7 +143,7 @@ int test_poly(peval_fun f, FILE *rpt) {
|
|||||||
ok = 0;
|
ok = 0;
|
||||||
if (rpt) {
|
if (rpt) {
|
||||||
fprintf(rpt,
|
fprintf(rpt,
|
||||||
"错误!多项式计算不对!阶=%d时,计算的值是%d,而正确值是%d\n",
|
"<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʽ<EFBFBD><EFBFBD><EFBFBD>㲻<EFBFBD>ԣ<EFBFBD><EFBFBD><EFBFBD>=%dʱ<64><CAB1><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ֵ<EFBFBD><D6B5>%d<><64><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ȷֵ<C8B7><D6B5>%d\n",
|
||||||
FIXDEGREE, v, fixval);
|
FIXDEGREE, v, fixval);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -175,7 +176,7 @@ void run_poly(peval_fun f, char *descr, double *cpep, double *cfixp)
|
|||||||
double cpe=0;
|
double cpe=0;
|
||||||
double fix_time=0;
|
double fix_time=0;
|
||||||
pfun = f;
|
pfun = f;
|
||||||
printf("函数:%s\n", descr);
|
printf("<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>%s\n", descr);
|
||||||
if (test_poly(f, stdout)) {
|
if (test_poly(f, stdout)) {
|
||||||
cpe = 0;
|
cpe = 0;
|
||||||
for (i=0;i<MAX_ITER_COUNT;i++)
|
for (i=0;i<MAX_ITER_COUNT;i++)
|
||||||
@@ -206,7 +207,7 @@ static double compute_score(double cmeas, double cref, double cbest)
|
|||||||
return 100*((smeas-1.0)/(sbest-1.0) + 0.1);
|
return 100*((smeas-1.0)/(sbest-1.0) + 0.1);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* 产生一个0~divv-1之间的随机数,同时更新随机数种子 */
|
/* 产生一个0~divv-1之间的随机数,同时更新随机数种子 */
|
||||||
void GenerateRandomNumber(unsigned long divv)
|
void GenerateRandomNumber(unsigned long divv)
|
||||||
{
|
{
|
||||||
unsigned long long x = rand1_h;
|
unsigned long long x = rand1_h;
|
||||||
@@ -230,18 +231,18 @@ int main(int argc, char *argv[])
|
|||||||
|
|
||||||
// CPU_Factor();
|
// CPU_Factor();
|
||||||
// GetCpuClock();
|
// GetCpuClock();
|
||||||
printf("\t2015多项式优化实验,欢迎你!\n");
|
printf("\t2015<EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʽ<EFBFBD>Ż<EFBFBD>ʵ<EFBFBD>飬<EFBFBD><EFBFBD>ӭ<EFBFBD>㣡\n");
|
||||||
printf("============================\n");
|
printf("============================\n");
|
||||||
|
|
||||||
if (argc == 1)
|
if (argc == 1)
|
||||||
{
|
{
|
||||||
printf("使用方法:%s 学号后6位 [学号后6位] [学号后6位] ...\n",argv[0]);
|
printf("ʹ<EFBFBD>÷<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>%s ѧ<>ź<EFBFBD>6λ [ѧ<>ź<EFBFBD>6λ] [ѧ<>ź<EFBFBD>6λ] ...\n",argv[0]);
|
||||||
printf("你需要依据提示改写poly.c程序,实现一个常系数多项式的计算,尽可能快哦....\n");
|
printf("<EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ҫ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʾ<EFBFBD><EFBFBD>дpoly.c<><63><EFBFBD><EFBFBD>ʵ<EFBFBD><CAB5>һ<EFBFBD><D2BB><EFBFBD><EFBFBD>ϵ<EFBFBD><CFB5><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʽ<EFBFBD>ļ<EFBFBD><C4BC>㣬<EFBFBD><E3A3AC><EFBFBD><EFBFBD><EFBFBD>ܿ<EFBFBD>Ŷ....\n");
|
||||||
printf("另外,你需要改写poly.c程序,实现任意阶的多项式计算和10阶的多项式计算,要快!\n");
|
printf("<EFBFBD><EFBFBD><EFBFBD>⣬<EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ҫ<EFBFBD><EFBFBD>дpoly.c<><63><EFBFBD><EFBFBD>ʵ<EFBFBD><CAB5><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ķ<D7B5><C4B6><EFBFBD>ʽ<EFBFBD><CABD><EFBFBD><EFBFBD><EFBFBD>10<31>Ķ<D7B5><C4B6><EFBFBD>ʽ<EFBFBD><CABD><EFBFBD>㣬Ҫ<E3A3AC>죡\n");
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*依据学号,初始化一个随机数发生器*/
|
/*<EFBFBD><EFBFBD><EFBFBD><EFBFBD>ѧ<EFBFBD>ţ<EFBFBD><EFBFBD><EFBFBD>ʼ<EFBFBD><EFBFBD>һ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>*/
|
||||||
rand1_h = (unsigned long)atoi(argv[1]);
|
rand1_h = (unsigned long)atoi(argv[1]);
|
||||||
rand1_l=0x29A;
|
rand1_l=0x29A;
|
||||||
GenerateRandomNumber(0);
|
GenerateRandomNumber(0);
|
||||||
@@ -266,10 +267,10 @@ int main(int argc, char *argv[])
|
|||||||
//make_CPU_busy();
|
//make_CPU_busy();
|
||||||
run_poly(peval_fun_tab[i].f, peval_fun_tab[i].descr, &cpe, &cfix);
|
run_poly(peval_fun_tab[i].f, peval_fun_tab[i].descr, &cpe, &cfix);
|
||||||
if (i == 0)
|
if (i == 0)
|
||||||
printf(" 最高的CPE得分 =========================== %.0f\n",
|
printf(" <EFBFBD><EFBFBD>ߵ<EFBFBD>CPE<EFBFBD>÷<EFBFBD> =========================== %.0f\n",
|
||||||
compute_score(cpe, cstandard[0].cref, cstandard[0].cbest));
|
compute_score(cpe, cstandard[0].cref, cstandard[0].cbest));
|
||||||
if (i == 1)
|
if (i == 1)
|
||||||
printf(" 最高的C(10)得分 ========================= %.0f\n",
|
printf(" <EFBFBD><EFBFBD>ߵ<EFBFBD>C(10)<EFBFBD>÷<EFBFBD> ========================= %.0f\n",
|
||||||
compute_score(cfix, cstandard[1].cref, cstandard[1].cbest));
|
compute_score(cfix, cstandard[1].cref, cstandard[1].cbest));
|
||||||
}
|
}
|
||||||
return 0;
|
return 0;
|
||||||
|
|||||||
BIN
perflab/poly/poly_test.o
Normal file
BIN
perflab/poly/poly_test.o
Normal file
Binary file not shown.
Reference in New Issue
Block a user