matrix fixed

This commit is contained in:
2025-04-12 11:37:07 +08:00
parent b92e49bd71
commit ba21f80f3b
24 changed files with 1840 additions and 569 deletions

34
perflab/matrix/Makefile Normal file
View File

@@ -0,0 +1,34 @@
# Build settings for the matrix_test program.
# CC is used for both compiling and linking; CFLAGS is passed only when compiling.
CC = gcc
CFLAGS = -Wall -O1 -g
# LDFLAGS is not assigned in this file (the CUDA link flags below are disabled), so
# $(LDFLAGS) in the link rule is empty unless supplied on the command line or by the environment.
#LDFLAGS = -lm -lcudart -lcuda
# C source files that make up the program
SRCS = rowcol_test.c clock.c cpe.c fcyc.c lsquare.c rowcol_202302723005.c
# Disabled: CUDA source file
#CUDA_SRCS = rowcol.cu
# One object file per entry in SRCS (.c suffix replaced by .o)
OBJS = $(SRCS:.c=.o)
# Disabled: extra object from the CUDA build
#rowcol.o
# Name of the executable produced by the link rule
TARGET = matrix_test
# First target in the file, hence the default: build the executable
all: $(TARGET)
# Link every object file into the executable
$(TARGET): $(OBJS)
$(CC) $(OBJS) -o $(TARGET) $(LDFLAGS)
# Pattern rule: compile each .c file into the .o file of the same name
%.o: %.c
$(CC) $(CFLAGS) -c $< -o $@
# Disabled: rule that would build the CUDA object file
#rowcol.o: rowcol.cu
# $(NVCC) $(CUDA_FLAGS) -c $< -o $@
# Delete the object files and the executable
clean:
rm -f $(OBJS) $(TARGET)
# all and clean name actions, not files on disk
.PHONY: all clean

View File

@@ -1,229 +1,196 @@
/* clock.c /* clock.c
* Retrofitted to use thread-specific timers * Retrofitted to use thread-specific timers
* and to get clock information from /proc/cpuinfo * and to get clock information from /proc/cpuinfo
* (C) R. E. Bryant, 2010 * (C) R. E. Bryant, 2010
* * Modified for cross-platform compatibility
*/ */
/* When this constant is not defined, uses time stamp counter */ #define _GNU_SOURCE // For sched_setaffinity on Linux
#define USE_POSIX 0 #include <stdint.h>
#include <stdio.h>
/* Choice to use cpu_gettime call or Intel time stamp counter directly */ #include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h> #ifdef _WIN32
#include <string.h> #include <intrin.h>
#include <intrin.h> #include <windows.h>
//#include <intrinsics.h> #else
#include <windows.h> #include <sched.h>
#include <time.h> #include <time.h>
#include "clock.h" #include <unistd.h>
#include <x86intrin.h>
/* Use x86 cycle counter */ typedef struct {
uint64_t QuadPart;
/* Initialize the cycle counter */ } LARGE_INTEGER;
static unsigned cyc_hi = 0; typedef void *HANDLE;
static unsigned cyc_lo = 0; #define __int64 long long
#define Sleep(ms) usleep((ms) * 1000)
/* Set *hi and *lo to the high and low order bits of the cycle counter. #endif
Implementation requires assembly code to use the rdtsc instruction. */
void access_counter(unsigned *hi, unsigned *lo) #include "clock.h"
{
/* Use x86 cycle counter */
long long counter; static unsigned cyc_hi = 0;
static unsigned cyc_lo = 0;
counter = __rdtsc();
(*hi) = (unsigned int)(counter >> 32); void access_counter(unsigned *hi, unsigned *lo) {
(*lo) = (unsigned int)counter; uint64_t counter = __rdtsc();
/* *hi = (unsigned)(counter >> 32);
*lo = (unsigned)counter;
LARGE_INTEGER lPerformanceCount; }
QueryPerformanceCounter(&lPerformanceCount); void start_counter() { access_counter(&cyc_hi, &cyc_lo); }
(*hi) = (unsigned int)lPerformanceCount.HighPart;
(*lo) = (unsigned int)lPerformanceCount.LowPart; double get_counter() {
// printf("%08X %08X\n",(*hi),(*lo)); unsigned ncyc_hi, ncyc_lo;
*/ access_counter(&ncyc_hi, &ncyc_lo);
} uint64_t start = ((uint64_t)cyc_hi << 32) | cyc_lo;
uint64_t end = ((uint64_t)ncyc_hi << 32) | ncyc_lo;
return (double)(end - start);
/* Record the current value of the cycle counter. */ }
void start_counter()
{ void make_CPU_busy(void) {
access_counter(&cyc_hi, &cyc_lo); volatile double old_tick = get_counter();
} volatile double new_tick;
while ((new_tick - old_tick) < 1000000000) {
/* Return the number of cycles since the last call to start_counter. */ new_tick = get_counter();
double get_counter() }
{ }
unsigned ncyc_hi, ncyc_lo;
unsigned hi, lo, borrow; #ifdef _WIN32
double result; #define GET_TIME(dest) QueryPerformanceCounter(dest)
#else
/* Get cycle counter */ static inline void GET_TIME(LARGE_INTEGER *dest) {
access_counter(&ncyc_hi, &ncyc_lo); struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
/* Do double precision subtraction */ dest->QuadPart = (uint64_t)ts.tv_sec * 1000000000 + ts.tv_nsec;
lo = ncyc_lo - cyc_lo; }
borrow = cyc_lo > ncyc_lo; #define QueryPerformanceFrequency(freq) ((freq)->QuadPart = 1000000000)
hi = ncyc_hi - cyc_hi - borrow; #endif
result = (double) hi * (1 << 30) * 4 + lo;
return result; double mhz(int verbose) {
} LARGE_INTEGER lFrequency;
void make_CPU_busy(void) LARGE_INTEGER lPerformanceCount_Start;
{ LARGE_INTEGER lPerformanceCount_End;
volatile double old_tick,new_tick; double mhz;
start_counter(); double fTime;
old_tick = get_counter(); __int64 _i64StartCpuCounter;
new_tick = get_counter(); __int64 _i64EndCpuCounter;
while (new_tick - old_tick < 1000000000)
new_tick = get_counter(); #ifdef _WIN32
} HANDLE hThread = GetCurrentThread();
SetThreadAffinityMask(hThread, 0x1);
//CPU的频率 #else
double mhz(int verbose) cpu_set_t cpuset;
{ CPU_ZERO(&cpuset);
LARGE_INTEGER lFrequency; CPU_SET(0, &cpuset);
LARGE_INTEGER lPerformanceCount_Start; sched_setaffinity(0, sizeof(cpuset), &cpuset);
LARGE_INTEGER lPerformanceCount_End; #endif
double mhz;
double fTime; QueryPerformanceFrequency(&lFrequency);
__int64 _i64StartCpuCounter; GET_TIME(&lPerformanceCount_Start);
__int64 _i64EndCpuCounter; _i64StartCpuCounter = __rdtsc();
//On a multiprocessor machine, it should not matter which processor is called. Sleep(200);
//However, you can get different results on different processors due to bugs in GET_TIME(&lPerformanceCount_End);
//the BIOS or the HAL. To specify processor affinity for a thread, use the SetThreadAffinityMask function. _i64EndCpuCounter = __rdtsc();
HANDLE hThread=GetCurrentThread();
SetThreadAffinityMask(hThread,0x1); fTime = (lPerformanceCount_End.QuadPart - lPerformanceCount_Start.QuadPart) /
(double)lFrequency.QuadPart;
//主板上高精度定时器的晶振频率 mhz = (_i64EndCpuCounter - _i64StartCpuCounter) / (fTime * 1000000.0);
//这个定时器应该就是一片8253或者8254
//在intel ich7中集成了8254 if (verbose > 0) {
QueryPerformanceFrequency(&lFrequency); printf("CPU频率为: %.6fMHz.\n", mhz);
// if (verbose>0) }
// printf("高精度定时器的晶振频率:%1.0fHz.\n",(double)lFrequency.QuadPart); return mhz;
}
//这个定时器每经过一个时钟周期,其计数器会+1
QueryPerformanceCounter(&lPerformanceCount_Start); double CPU_Factor1(void) {
double result;
//RDTSC指令:获取CPU经历的时钟周期数 int i, j, k;
_i64StartCpuCounter=__rdtsc(); LARGE_INTEGER lStart, lEnd;
LARGE_INTEGER lFrequency;
//延时长一点,误差会小一点 double fTime;
//int nTemp=100000;
//while (--nTemp); #ifdef _WIN32
Sleep(200); HANDLE hThread = GetCurrentThread();
SetThreadAffinityMask(hThread, 0x1);
QueryPerformanceCounter(&lPerformanceCount_End); #else
cpu_set_t cpuset;
_i64EndCpuCounter=__rdtsc(); CPU_ZERO(&cpuset);
CPU_SET(0, &cpuset);
//f=1/T => f=计数次数/(计数次数*T) sched_setaffinity(0, sizeof(cpuset), &cpuset);
//这里的“计数次数*T”就是时间差 #endif
fTime=((double)lPerformanceCount_End.QuadPart-(double)lPerformanceCount_Start.QuadPart)
/(double)lFrequency.QuadPart; QueryPerformanceFrequency(&lFrequency);
GET_TIME(&lStart);
mhz = (_i64EndCpuCounter-_i64StartCpuCounter)/(fTime*1000000.0); start_counter();
if (verbose>0)
printf("CPU频率为:%1.6fMHz.\n",mhz); for (i = 0; i < 100; i++)
return mhz; for (j = 0; j < 1000; j++)
} for (k = 0; k < 1000; k++)
;
double CPU_Factor1(void)
{ result = get_counter();
double result; GET_TIME(&lEnd);
int i,j,k,ii,jj,kk;
LARGE_INTEGER lStart,lEnd; fTime = (lEnd.QuadPart - lStart.QuadPart) / (double)lFrequency.QuadPart;
LARGE_INTEGER lFrequency; printf("CPU计算时长为: %f", result);
HANDLE hThread; printf("\t %f\n", fTime);
double fTime; return result;
}
QueryPerformanceFrequency(&lFrequency);
double CPU_Factor(void) {
ii = 43273; double frequency;
kk = 1238; double multiplier = 1000 * 1000 * 1000; // nano
result = 1; LARGE_INTEGER lFrequency;
jj = 1244; LARGE_INTEGER start, stop;
int i;
hThread=GetCurrentThread(); const int known_instructions_per_loop = 27317;
SetThreadAffinityMask(hThread,0x1); int iterations = 100000000;
QueryPerformanceCounter(&lStart); int g = 0;
//_asm("cpuid"); double normal_ticks_per_second;
start_counter(); double ticks;
for (i=0;i<100;i++) double time;
for (j=0;j<1000;j++) double loops_per_sec;
for (k=0;k<1000;k++) double instructions_per_loop;
kk += kk*ii+jj; double ratio;
double actual_freq;
result = get_counter();
QueryPerformanceCounter(&lEnd); #ifdef _WIN32
fTime=((double)lEnd.QuadPart-(double)lStart.QuadPart); HANDLE hThread = GetCurrentThread();
printf("CPU运行时间为%f",result); SetThreadAffinityMask(hThread, 0x1);
printf("\t %f\n",fTime); #else
return result; cpu_set_t cpuset;
} CPU_ZERO(&cpuset);
CPU_SET(0, &cpuset);
double CPU_Factor(void) sched_setaffinity(0, sizeof(cpuset), &cpuset);
{ #endif
double frequency;
double multiplier = 1000 * 1000 * 1000;//nano QueryPerformanceFrequency(&lFrequency);
LARGE_INTEGER lFrequency; frequency = (double)lFrequency.QuadPart;
LARGE_INTEGER start,stop; GET_TIME(&start);
HANDLE hThread;
int i; for (i = 0; i < iterations; i++) {
const int gigahertz= 1000*1000*1000; g++;
const int known_instructions_per_loop = 27317; g++;
g++;
int iterations = 100000000; g++;
int g = 0; }
double normal_ticks_per_second;
double ticks; GET_TIME(&stop);
double time; normal_ticks_per_second = frequency * 1000;
double loops_per_sec; ticks = (double)(stop.QuadPart - start.QuadPart);
double instructions_per_loop; time = (ticks * multiplier) / frequency;
double ratio; loops_per_sec = iterations / (time / multiplier);
double actual_freq; instructions_per_loop = normal_ticks_per_second / loops_per_sec;
ratio = instructions_per_loop / known_instructions_per_loop;
QueryPerformanceFrequency(&lFrequency); actual_freq = normal_ticks_per_second / ratio;
frequency = (double)lFrequency.QuadPart;
printf("Perf counter freq: %f\n", normal_ticks_per_second);
hThread=GetCurrentThread(); printf("Loops per sec: %f\n", loops_per_sec);
SetThreadAffinityMask(hThread,0x1); printf("Perf counter freq div loops per sec: %f\n", instructions_per_loop);
QueryPerformanceCounter(&start); printf("Presumed freq: %f\n", actual_freq);
for( i = 0; i < iterations; i++) printf("ratio: %f\n", ratio);
{ printf("time=%f\n", time);
g++; return ratio;
g++; }
g++;
g++;
}
QueryPerformanceCounter(&stop);
//normal ticks differs from the WMI data, i.e 3125, when WMI 3201, and CPUZ 3199
normal_ticks_per_second = frequency * 1000;
ticks = (double)((double)stop.QuadPart - (double)start.QuadPart);
time = (ticks * multiplier) /frequency;
loops_per_sec = iterations / (time/multiplier);
instructions_per_loop = normal_ticks_per_second / loops_per_sec;
ratio = (instructions_per_loop / known_instructions_per_loop);
actual_freq = normal_ticks_per_second / ratio;
/*
actual_freq = normal_ticks_per_second / ratio;
actual_freq = known_instructions_per_loop*iterations*multiplier/time;
2293 = x/time;
2292.599713*1191533038.809362=known_instructions_per_loop*100000000*1000
loops_per_sec = iterations*frequency / ticks
instructions_per_loop = / loops_per_sec;
*/
printf("Perf counter freq: %f\n", normal_ticks_per_second);
printf("Loops per sec: %f\n", loops_per_sec);
printf("Perf counter freq div loops per sec: %f\n", instructions_per_loop);
printf("Presumed freq: %f\n", actual_freq);
printf("ratio: %f\n", ratio);
printf("time=%f\n",time);
return ratio;
}

229
perflab/matrix/clock.c.bak Normal file
View File

@@ -0,0 +1,229 @@
/* clock.c
* Retrofitted to use thread-specific timers
* and to get clock information from /proc/cpuinfo
* (C) R. E. Bryant, 2010
*
*/
/* When this constant is not defined, uses time stamp counter */
#define USE_POSIX 0
/* Choice to use cpu_gettime call or Intel time stamp counter directly */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <x86intrin.h>
//#include <intrinsics.h>
//#include <windows.h>
#include <time.h>
#include "clock.h"
/* Use x86 cycle counter */
/* Initialize the cycle counter */
static unsigned cyc_hi = 0;
static unsigned cyc_lo = 0;
/*
 * Read the processor time stamp counter and split it into two 32-bit halves.
 *
 * hi: receives bits 63..32 of the counter.
 * lo: receives bits 31..0 of the counter.
 *
 * Both pointers must be valid; they are written unconditionally.
 */
void access_counter(unsigned *hi, unsigned *lo)
{
    /* Held as an unsigned 64-bit value so the shift below is a plain
       logical shift whatever the counter's top bit is. */
    unsigned long long tsc = __rdtsc();

    *hi = (unsigned)(tsc >> 32);
    *lo = (unsigned)tsc;
}
/*
 * Take a snapshot of the cycle counter and store it in the file-level
 * cyc_hi / cyc_lo pair, which get_counter() later measures against.
 */
void start_counter()
{
    unsigned high_word;
    unsigned low_word;

    access_counter(&high_word, &low_word);
    cyc_hi = high_word;
    cyc_lo = low_word;
}
/* Return the number of cycles since the last call to start_counter. */
double get_counter()
{
unsigned ncyc_hi, ncyc_lo;
unsigned hi, lo, borrow;
double result;
/* Get cycle counter */
access_counter(&ncyc_hi, &ncyc_lo);
/* Do double precision subtraction */
lo = ncyc_lo - cyc_lo;
borrow = cyc_lo > ncyc_lo;
hi = ncyc_hi - cyc_hi - borrow;
result = (double) hi * (1 << 30) * 4 + lo;
return result;
}
/*
 * Spin until the cycle counter has advanced by at least 1,000,000,000
 * since the first sample taken inside this function.
 *
 * Side effect: calls start_counter(), so the stored reference snapshot
 * is overwritten.
 */
void make_CPU_busy(void)
{
    volatile double first_sample, latest_sample;

    start_counter();
    first_sample = get_counter();

    /* Re-sample the counter until the required distance is reached. */
    for (latest_sample = get_counter();
         latest_sample - first_sample < 1000000000;
         latest_sample = get_counter()) {
        /* busy wait */
    }
}
/*
 * Estimate the CPU clock rate in MHz.
 *
 * The time stamp counter is sampled before and after a 200 ms sleep and the
 * cycle difference is divided by the elapsed time reported by the
 * performance counter.
 *
 * verbose: when greater than 0, the result is also printed.
 * Returns the estimated frequency in MHz.
 */
double mhz(int verbose)
{
    LARGE_INTEGER timer_freq;
    LARGE_INTEGER timer_begin;
    LARGE_INTEGER timer_end;
    __int64 tsc_begin;
    __int64 tsc_end;
    double elapsed_sec;
    double rate_mhz;
    HANDLE self;

    /* On a multiprocessor machine it should not matter which processor is
       used, but BIOS or HAL bugs can give different readings on different
       processors, so the thread is restricted to affinity mask 0x1. */
    self = GetCurrentThread();
    SetThreadAffinityMask(self, 0x1);

    /* Tick rate of the high-resolution timer (original author's note:
       presumably an 8253/8254-style timer, integrated in Intel ICH7). */
    QueryPerformanceFrequency(&timer_freq);

    /* The timer's count increases by one per timer tick. */
    QueryPerformanceCounter(&timer_begin);
    /* RDTSC: number of CPU clock cycles elapsed so far. */
    tsc_begin = __rdtsc();

    /* A longer delay makes the relative error smaller. */
    Sleep(200);

    QueryPerformanceCounter(&timer_end);
    tsc_end = __rdtsc();

    /* f = 1/T  =>  f = count / (count * T); here "count * T" is the
       elapsed time in seconds. */
    elapsed_sec = ((double)timer_end.QuadPart - (double)timer_begin.QuadPart)
                  / (double)timer_freq.QuadPart;

    rate_mhz = (tsc_end - tsc_begin) / (elapsed_sec * 1000000.0);
    if (verbose > 0) {
        printf("CPU频率为:%1.6fMHz.\n", rate_mhz);
    }
    return rate_mhz;
}
/*
 * Time a fixed arithmetic workload (100 * 1000 * 1000 multiply-add steps)
 * and report how long it took.
 *
 * Prints the elapsed cycle count followed by the elapsed performance-counter
 * ticks, and returns the cycle count.
 *
 * Side effect: calls start_counter(), so the stored reference snapshot is
 * overwritten; the thread is restricted to affinity mask 0x1.
 */
double CPU_Factor1(void)
{
    double result;
    int i, j, k;
    /* Unsigned on purpose: the recurrence below overflows almost at once,
       and only unsigned arithmetic has defined wrap-around behavior. */
    unsigned ii, jj, kk;
    LARGE_INTEGER lStart, lEnd;
    LARGE_INTEGER lFrequency;
    HANDLE hThread;
    double fTime;

    /* The frequency is queried but not used below: fTime stays in raw
       performance-counter ticks. */
    QueryPerformanceFrequency(&lFrequency);

    ii = 43273u;
    kk = 1238u;
    jj = 1244u;

    hThread = GetCurrentThread();
    SetThreadAffinityMask(hThread, 0x1);

    QueryPerformanceCounter(&lStart);
    start_counter();
    for (i = 0; i < 100; i++) {
        for (j = 0; j < 1000; j++) {
            for (k = 0; k < 1000; k++) {
                kk += kk * ii + jj;
            }
        }
    }
    result = get_counter();
    QueryPerformanceCounter(&lEnd);

    fTime = ((double)lEnd.QuadPart - (double)lStart.QuadPart);
    printf("CPU运行时间为%f", result);
    printf("\t %f\n", fTime);
    return result;
}
/*
 * Time a fixed-count increment loop with the performance counter, print
 * several figures derived from the measurement, and return the last one
 * computed before the presumed frequency (the "ratio").
 *
 * Derivation, all in double precision:
 *   normal_ticks_per_second = counter frequency * 1000
 *   elapsed_ns              = ticks * 1e9 / counter frequency
 *   loops_per_sec           = iterations / (elapsed_ns / 1e9)
 *   instructions_per_loop   = normal_ticks_per_second / loops_per_sec
 *   ratio                   = instructions_per_loop / 27317
 *   actual_freq             = normal_ticks_per_second / ratio
 *
 * The thread is restricted to affinity mask 0x1 before timing starts.
 */
double CPU_Factor(void)
{
    const int known_instructions_per_loop = 27317;
    double nano = 1000 * 1000 * 1000;
    int iterations = 100000000;
    int g = 0;
    int pass;
    LARGE_INTEGER perf_freq;
    LARGE_INTEGER t_begin, t_end;
    HANDLE self;
    double frequency;
    double normal_ticks_per_second;
    double ticks;
    double elapsed_ns;
    double loops_per_sec;
    double instructions_per_loop;
    double ratio;
    double actual_freq;

    QueryPerformanceFrequency(&perf_freq);
    frequency = (double)perf_freq.QuadPart;

    self = GetCurrentThread();
    SetThreadAffinityMask(self, 0x1);

    QueryPerformanceCounter(&t_begin);
    for (pass = 0; pass < iterations; pass++) {
        g++;
        g++;
        g++;
        g++;
    }
    QueryPerformanceCounter(&t_end);

    /* Original author's note: this figure differs from the WMI data,
       e.g. 3125 here versus 3201 from WMI and 3199 from CPU-Z. */
    normal_ticks_per_second = frequency * 1000;
    ticks = (double)((double)t_end.QuadPart - (double)t_begin.QuadPart);
    elapsed_ns = (ticks * nano) / frequency;
    loops_per_sec = iterations / (elapsed_ns / nano);
    instructions_per_loop = normal_ticks_per_second / loops_per_sec;
    ratio = (instructions_per_loop / known_instructions_per_loop);
    actual_freq = normal_ticks_per_second / ratio;

    printf("Perf counter freq: %f\n", normal_ticks_per_second);
    printf("Loops per sec: %f\n", loops_per_sec);
    printf("Perf counter freq div loops per sec: %f\n", instructions_per_loop);
    printf("Presumed freq: %f\n", actual_freq);
    printf("ratio: %f\n", ratio);
    printf("time=%f\n", elapsed_ns);
    return ratio;
}

BIN
perflab/matrix/clock.o Normal file

Binary file not shown.

BIN
perflab/matrix/cpe.o Normal file

Binary file not shown.

View File

@@ -119,7 +119,7 @@ double fcyc(test_funct f, int *params)
if (clear_cache) if (clear_cache)
clear(); clear();
start_counter(); start_counter();
f(params); f((long*)params);
cyc = get_counter(); cyc = get_counter();
if (cyc > 0.0) if (cyc > 0.0)
add_sample(cyc); add_sample(cyc);
@@ -131,7 +131,7 @@ double fcyc(test_funct f, int *params)
clear(); clear();
start_counter(); start_counter();
for (i=0;i<MAX_ITER_TIMES;i++) for (i=0;i<MAX_ITER_TIMES;i++)
f(params); f((long*)params);
cyc = get_counter()/MAX_ITER_TIMES; cyc = get_counter()/MAX_ITER_TIMES;
if (cyc > 0.0) if (cyc > 0.0)
add_sample(cyc); add_sample(cyc);

BIN
perflab/matrix/fcyc.o Normal file

Binary file not shown.

BIN
perflab/matrix/lsquare.o Normal file

Binary file not shown.

BIN
perflab/matrix/matrix_test Normal file

Binary file not shown.

View File

@@ -1,77 +1,69 @@
/************************************************************************** /**************************************************************************
行/列求和函数。按下面的要求编辑此文件: ??/???????????????????????????????
1. 将你的学号、姓名,以注释的方式写到下面; 1. ???????????????????????????????
2. 实现不同版本的行列求和函数; 2. ??????????????????????
3. 编辑rc_fun_rec rc_fun_tab数组,将你的最好的答案 3. ??rc_fun_rec rc_fun_tab??????????????????
(最好的行和列求和、最好的列求和)作为数组的前两项 ???????????????????????????????????????????
***************************************************************************/ ***************************************************************************/
/* /*
学号:201209054233 ????201209054233
姓名:夜半加班狂 ??????????????
*/ */
#include "rowcol.h"
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdio.h> /* ????????????????? */
#include <stdlib.h> /* ???????????????????????????????????????????????
#include "rowcol.h" ??????????2?????????????????
#include <math.h>
/* 参考的列求和函数实现 */
/* 计算矩阵中的每一列的和。请注意对于行和列求和来说,调用参数是
一样的只是第2个参数不会用到而已
*/ */
void c_sum(matrix_t M, vector_t rowsum, vector_t colsum) void c_sum(matrix_t M, vector_t rowsum, vector_t colsum) {
{ int i, j;
int i,j; for (j = 0; j < N; j++) {
colsum[j] = 0;
for (i = 0; i < N; i++)
colsum[j] += M[i][j];
}
}
/* ???????????????????? */
/* ??????????????????????? */
void rc_sum(matrix_t M, vector_t rowsum, vector_t colsum) {
int i, j;
for (i = 0; i < N; i++) {
rowsum[i] = colsum[i] = 0;
for (j = 0; j < N; j++) { for (j = 0; j < N; j++) {
colsum[j] = 0; rowsum[i] += M[i][j];
for (i = 0; i < N; i++) colsum[i] += M[j][i];
colsum[j] += M[i][j];
} }
}
} }
/*
/* 参考的列和行求和函数实现 */ ????????????????????????????????????????, COL/ROWCOL, "?????????"??
/* 计算矩阵中的每一行、每一列的和。 */ COL??????????????????????
ROWCOL???????????????????????
void rc_sum(matrix_t M, vector_t rowsum, vector_t colsum) ?????????????????????????????
{ ????
int i,j; {my_c_sum1, "?????????????????"},
for (i = 0; i < N; i++) { {my_rc_sum2, "??????????????????"},
rowsum[i] = colsum[i] = 0;
for (j = 0; j < N; j++) {
rowsum[i] += M[i][j];
colsum[i] += M[j][i];
}
}
}
/*
这个表格包含多个数组元素,每一组元素(函数名字, COL/ROWCOL, "描述字符串"
COL表示该函数仅仅计算每一列的和
ROWCOL表示该函数计算每一行、每一列的和
将你认为最好的两个实现,放在最前面。
比如:
{my_c_sum1, "超级垃圾列求和实现"},
{my_rc_sum2, "好一点的行列求和实现"},
*/ */
rc_fun_rec rc_fun_tab[] = rc_fun_rec rc_fun_tab[] = {
{
/* 第一项,应当是你写的最好列求和的函数实现 */ /* ???????????????????????????????? */
{c_sum, COL, "Best column sum"}, {c_sum, COL, "Best column sum"},
/* 第二项,应当是你写的最好行列求和的函数实现 */ /* ?????????????????????????????????? */
{rc_sum, ROWCOL, "Best row and column sum"}, {rc_sum, ROWCOL, "Best row and column sum"},
{c_sum, COL, "Column sum, reference implementation"}, {c_sum, COL, "Column sum, reference implementation"},
{rc_sum, ROWCOL, "Row and column sum, reference implementation"}, {rc_sum, ROWCOL, "Row and column sum, reference implementation"},
/* 下面的代码不能修改或者删除!!表明数组列表结束 */ /* ??????????????????????????????????????? */
{NULL,ROWCOL,NULL} {NULL, ROWCOL, NULL}};
};

162
perflab/matrix/rowcol.c~ Normal file
View File

@@ -0,0 +1,162 @@
/**************************************************************************
行/列求和函数。按下面的要求编辑此文件:
1. 将你的学号、姓名,以注释的方式写到下面;
2. 实现不同版本的行列求和函数;
3. 编辑rc_fun_rec rc_fun_tab数组将你的最好的答案
(最好的行和列求和、最好的列求和)作为数组的前两项
***************************************************************************/
/*
学号202302723005
姓名:程景愉
*/
#include <stdio.h>
#include <stdlib.h>
#include "rowcol.h"
#include <math.h>
#include <cuda_runtime.h>
/*
 * Reference column-sum implementation.
 *
 * For every column index c in [0, N), sets colsum[c] to the sum of M[r][c]
 * over all rows r. The signature is shared with the row-and-column
 * variants; rowsum is not touched here.
 */
void c_sum(matrix_t M, vector_t rowsum, vector_t colsum)
{
    int row, col;

    col = 0;
    while (col < N) {
        colsum[col] = 0;
        /* Walk down the column, adding one element per row. */
        for (row = 0; row < N; ++row) {
            colsum[col] += M[row][col];
        }
        ++col;
    }
}
/*
 * Reference row-and-column-sum implementation.
 *
 * For every index k in [0, N), sets rowsum[k] to the sum of row k and
 * colsum[k] to the sum of column k of M.
 */
void rc_sum(matrix_t M, vector_t rowsum, vector_t colsum)
{
    int idx, step;

    idx = 0;
    while (idx < N) {
        rowsum[idx] = colsum[idx] = 0;
        /* One pass feeds both totals: element `step` of row idx and
           element `step` of column idx. */
        for (step = 0; step < N; ++step) {
            rowsum[idx] += M[idx][step];
            colsum[idx] += M[step][idx];
        }
        ++idx;
    }
}
/* The kernel is defined further down in this file but launched below,
   so it has to be declared first. */
__global__ void cudaColumnSum(int *M, int *colsum);

/*
 * Column sums computed on the GPU.
 *
 * Copies M to the device, launches cudaColumnSum with 256-thread blocks
 * (enough blocks to cover N columns), and copies the N results back into
 * colsum. rowsum is not used.
 *
 * NOTE(review): buffer sizes assume the matrix/vector element type is int --
 * confirm against rowcol.h.
 *
 * On any CUDA error a message is written to stderr, device buffers are
 * released, and colsum may be left unmodified.
 */
void cuda_c_sum(matrix_t M, vector_t rowsum, vector_t colsum)
{
    const size_t matrix_bytes = (size_t)N * N * sizeof(int);
    const size_t vector_bytes = (size_t)N * sizeof(int);
    dim3 block(256);
    dim3 grid((N + block.x - 1) / block.x);
    int *d_M = NULL;
    int *d_colsum = NULL;
    cudaError_t err;

    (void)rowsum;

    /* Allocate device memory. */
    err = cudaMalloc((void **)&d_M, matrix_bytes);
    if (err == cudaSuccess)
        err = cudaMalloc((void **)&d_colsum, vector_bytes);

    /* Copy the matrix from host to device. */
    if (err == cudaSuccess)
        err = cudaMemcpy(d_M, M, matrix_bytes, cudaMemcpyHostToDevice);

    if (err == cudaSuccess) {
        /* Launch the kernel; launch failures are reported by cudaGetLastError. */
        cudaColumnSum<<<grid, block>>>(d_M, d_colsum);
        err = cudaGetLastError();
    }

    /* Copy the result back from device to host. */
    if (err == cudaSuccess)
        err = cudaMemcpy(colsum, d_colsum, vector_bytes, cudaMemcpyDeviceToHost);

    if (err != cudaSuccess)
        fprintf(stderr, "cuda_c_sum: %s\n", cudaGetErrorString(err));

    /* Release device memory (cudaFree accepts a null pointer). */
    cudaFree(d_M);
    cudaFree(d_colsum);
}
/* The kernel is defined further down in this file but launched below,
   so it has to be declared first. */
__global__ void cudaRowColSum(int *M, int *rowsum, int *colsum);

/*
 * Row and column sums computed on the GPU.
 *
 * Copies M to the device, launches cudaRowColSum with 256-thread blocks
 * (enough blocks to cover N indices), and copies the N row sums and N
 * column sums back into rowsum and colsum.
 *
 * NOTE(review): buffer sizes assume the matrix/vector element type is int --
 * confirm against rowcol.h.
 *
 * On any CUDA error a message is written to stderr, device buffers are
 * released, and the outputs may be left unmodified or only partly written.
 */
void cuda_rc_sum(matrix_t M, vector_t rowsum, vector_t colsum)
{
    const size_t matrix_bytes = (size_t)N * N * sizeof(int);
    const size_t vector_bytes = (size_t)N * sizeof(int);
    dim3 block(256);
    dim3 grid((N + block.x - 1) / block.x);
    int *d_M = NULL;
    int *d_rowsum = NULL;
    int *d_colsum = NULL;
    cudaError_t err;

    /* Allocate device memory. */
    err = cudaMalloc((void **)&d_M, matrix_bytes);
    if (err == cudaSuccess)
        err = cudaMalloc((void **)&d_rowsum, vector_bytes);
    if (err == cudaSuccess)
        err = cudaMalloc((void **)&d_colsum, vector_bytes);

    /* Copy the matrix from host to device. */
    if (err == cudaSuccess)
        err = cudaMemcpy(d_M, M, matrix_bytes, cudaMemcpyHostToDevice);

    if (err == cudaSuccess) {
        /* Launch the kernel; launch failures are reported by cudaGetLastError. */
        cudaRowColSum<<<grid, block>>>(d_M, d_rowsum, d_colsum);
        err = cudaGetLastError();
    }

    /* Copy both result vectors back from device to host. */
    if (err == cudaSuccess)
        err = cudaMemcpy(rowsum, d_rowsum, vector_bytes, cudaMemcpyDeviceToHost);
    if (err == cudaSuccess)
        err = cudaMemcpy(colsum, d_colsum, vector_bytes, cudaMemcpyDeviceToHost);

    if (err != cudaSuccess)
        fprintf(stderr, "cuda_rc_sum: %s\n", cudaGetErrorString(err));

    /* Release device memory (cudaFree accepts a null pointer). */
    cudaFree(d_M);
    cudaFree(d_rowsum);
    cudaFree(d_colsum);
}
/*
 * Kernel: column sums.
 *
 * Each thread handles the column whose index equals its global thread
 * index; threads with an index of N or more do nothing. M is read as a
 * flat row-major N x N array (element (row, col) at M[row * N + col]).
 */
__global__ void cudaColumnSum(int *M, int *colsum)
{
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (col >= N)
        return;

    /* Accumulate locally and store once, instead of updating colsum[col]
       through the pointer on every iteration. */
    int total = 0;
    for (int row = 0; row < N; row++) {
        total += M[row * N + col];
    }
    colsum[col] = total;
}
/*
 * Kernel: row and column sums.
 *
 * Each thread handles the index equal to its global thread index and
 * produces both the sum of that row and the sum of that column; threads
 * with an index of N or more do nothing. M is read as a flat row-major
 * N x N array (element (row, col) at M[row * N + col]).
 */
__global__ void cudaRowColSum(int *M, int *rowsum, int *colsum)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= N)
        return;

    /* Row sum: accumulate locally, store once. */
    int row_total = 0;
    for (int j = 0; j < N; j++) {
        row_total += M[idx * N + j];
    }
    rowsum[idx] = row_total;

    /* Column sum: accumulate locally, store once. */
    int col_total = 0;
    for (int i = 0; i < N; i++) {
        col_total += M[i * N + idx];
    }
    colsum[idx] = col_total;
}
/*
This table holds one entry per implementation, in the form
(function name, COL/ROWCOL, "description string").
COL means the function computes only the sum of each column.
ROWCOL means the function computes the sum of each row and of each column.
Put the two implementations you consider best at the front.
For example:
{my_c_sum1, "really poor column sum implementation"},
{my_rc_sum2, "somewhat better row and column sum implementation"},
*/
rc_fun_rec rc_fun_tab[] =
{
/* First entry: should be your best column-sum implementation */
{cuda_c_sum, COL, "CUDA optimized column sum"},
/* Second entry: should be your best row-and-column-sum implementation */
{cuda_rc_sum, ROWCOL, "CUDA optimized row and column sum"},
{c_sum, COL, "Column sum, reference implementation"},
{rc_sum, ROWCOL, "Row and column sum, reference implementation"},
/* Do not modify or delete the entry below: it marks the end of the list */
{NULL,ROWCOL,NULL}
};

BIN
perflab/matrix/rowcol.o Normal file

Binary file not shown.

240
perflab/matrix/rowcol.y~ Normal file
View File

@@ -0,0 +1,240 @@
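/**************************************************************************
Row/column sum functions. Edit this file as follows:
1. Put your student ID and name in the comment below;
2. Implement different versions of the row/column sum functions;
3. Edit the rc_fun_rec rc_fun_tab array so that your best answers
(best row-and-column sum, best column sum) are its first two entries
***************************************************************************/
/*
Student ID: 201209054233
Name: 夜半加班狂
*/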
#include <stdio.h>
#include <stdlib.h>
#include "rowcol.h"
#include <math.h>
/*
 * Reference column-sum implementation.
 *
 * For every column index c in [0, N), sets colsum[c] to the sum of M[r][c]
 * over all rows r. The signature is shared with the row-and-column
 * variants; rowsum is not touched here.
 */
void c_sum(matrix_t M, vector_t rowsum, vector_t colsum)
{
    int row, col;

    col = 0;
    while (col < N) {
        colsum[col] = 0;
        /* Walk down the column, adding one element per row. */
        for (row = 0; row < N; ++row) {
            colsum[col] += M[row][col];
        }
        ++col;
    }
}
/*
 * Reference row-and-column-sum implementation.
 *
 * For every index k in [0, N), sets rowsum[k] to the sum of row k and
 * colsum[k] to the sum of column k of M.
 */
void rc_sum(matrix_t M, vector_t rowsum, vector_t colsum)
{
    int idx, step;

    idx = 0;
    while (idx < N) {
        rowsum[idx] = colsum[idx] = 0;
        /* One pass feeds both totals: element `step` of row idx and
           element `step` of column idx. */
        for (step = 0; step < N; ++step) {
            rowsum[idx] += M[idx][step];
            colsum[idx] += M[step][idx];
        }
        ++idx;
    }
}
/*
This table holds one entry per implementation, in the form
(function name, COL/ROWCOL, "description string").
COL means the function computes only the sum of each column.
ROWCOL means the function computes the sum of each row and of each column.
Put the two implementations you consider best at the front.
For example:
{my_c_sum1, "really poor column sum implementation"},
{my_rc_sum2, "somewhat better row and column sum implementation"},
*/
rc_fun_rec rc_fun_tab[] =
{
/* First entry: should be your best column-sum implementation */
{c_sum, COL, "Best column sum"},
/* Second entry: should be your best row-and-column-sum implementation */
{rc_sum, ROWCOL, "Best row and column sum"},
{c_sum, COL, "Column sum, reference implementation"},
{rc_sum, ROWCOL, "Row and column sum, reference implementation"},
/* Do not modify or delete the entry below: it marks the end of the list */
{NULL,ROWCOL,NULL}
};
// /**************************************************************************
// 行/列求和函数。按下面的要求编辑此文件:
// 1. 将你的学号、姓名,以注释的方式写到下面;
// 2. 实现不同版本的行列求和函数;
// 3. 编辑rc_fun_rec rc_fun_tab数组将你的最好的答案
// (最好的行和列求和、最好的列求和)作为数组的前两项
// ***************************************************************************/
//
// /*
// 学号202302723005
// 姓名:程景愉
// */
//
//
// #include <stdio.h>
// #include <stdlib.h>
// #include "rowcol.h"
// #include <math.h>
// #include <cuda_runtime.h>
//
// /* 参考的列求和函数实现 */
// /* 计算矩阵中的每一列的和。请注意对于行和列求和来说,调用参数是
// 一样的只是第2个参数不会用到而已
// */
//
// void c_sum(matrix_t M, vector_t rowsum, vector_t colsum)
// {
// int i,j;
// for (j = 0; j < N; j++) {
// colsum[j] = 0;
// for (i = 0; i < N; i++)
// colsum[j] += M[i][j];
// }
// }
//
//
// /* 参考的列和行求和函数实现 */
// /* 计算矩阵中的每一行、每一列的和。 */
//
// void rc_sum(matrix_t M, vector_t rowsum, vector_t colsum)
// {
// int i,j;
// for (i = 0; i < N; i++) {
// rowsum[i] = colsum[i] = 0;
// for (j = 0; j < N; j++) {
// rowsum[i] += M[i][j];
// colsum[i] += M[j][i];
// }
// }
// }
//
// /* CUDA优化的列求和函数 */
// void cuda_c_sum(matrix_t M, vector_t rowsum, vector_t colsum)
// {
// // 分配设备内存
// int *d_M, *d_colsum;
// cudaMalloc(&d_M, N * N * sizeof(int));
// cudaMalloc(&d_colsum, N * sizeof(int));
//
// // 将数据从主机复制到设备
// cudaMemcpy(d_M, M, N * N * sizeof(int), cudaMemcpyHostToDevice);
//
// // 定义CUDA核函数
// dim3 blockDim(256);
// dim3 gridDim((N + blockDim.x - 1) / blockDim.x);
//
// // 启动核函数
// cudaColumnSum<<<gridDim, blockDim>>>(d_M, d_colsum);
//
// // 将结果从设备复制回主机
// cudaMemcpy(colsum, d_colsum, N * sizeof(int), cudaMemcpyDeviceToHost);
//
// // 释放设备内存
// cudaFree(d_M);
// cudaFree(d_colsum);
// }
//
// /* CUDA优化的行列求和函数 */
// void cuda_rc_sum(matrix_t M, vector_t rowsum, vector_t colsum)
// {
// // 分配设备内存
// int *d_M, *d_rowsum, *d_colsum;
// cudaMalloc(&d_M, N * N * sizeof(int));
// cudaMalloc(&d_rowsum, N * sizeof(int));
// cudaMalloc(&d_colsum, N * sizeof(int));
//
// // 将数据从主机复制到设备
// cudaMemcpy(d_M, M, N * N * sizeof(int), cudaMemcpyHostToDevice);
//
// // 定义CUDA核函数
// dim3 blockDim(256);
// dim3 gridDim((N + blockDim.x - 1) / blockDim.x);
//
// // 启动核函数
// cudaRowColSum<<<gridDim, blockDim>>>(d_M, d_rowsum, d_colsum);
//
// // 将结果从设备复制回主机
// cudaMemcpy(rowsum, d_rowsum, N * sizeof(int), cudaMemcpyDeviceToHost);
// cudaMemcpy(colsum, d_colsum, N * sizeof(int), cudaMemcpyDeviceToHost);
//
// // 释放设备内存
// cudaFree(d_M);
// cudaFree(d_rowsum);
// cudaFree(d_colsum);
// }
//
// /* CUDA核函数 - 列求和 */
// __global__ void cudaColumnSum(int *M, int *colsum)
// {
// int col = blockIdx.x * blockDim.x + threadIdx.x;
// if (col < N) {
// colsum[col] = 0;
// for (int row = 0; row < N; row++) {
// colsum[col] += M[row * N + col];
// }
// }
// }
//
// /* CUDA核函数 - 行列求和 */
// __global__ void cudaRowColSum(int *M, int *rowsum, int *colsum)
// {
// int idx = blockIdx.x * blockDim.x + threadIdx.x;
// if (idx < N) {
// // 计算行和
// rowsum[idx] = 0;
// for (int j = 0; j < N; j++) {
// rowsum[idx] += M[idx * N + j];
// }
//
// // 计算列和
// colsum[idx] = 0;
// for (int i = 0; i < N; i++) {
// colsum[idx] += M[i * N + idx];
// }
// }
// }
//
// /*
// 这个表格包含多个数组元素,每一组元素(函数名字, COL/ROWCOL, "描述字符串"
// COL表示该函数仅仅计算每一列的和
// ROWCOL表示该函数计算每一行、每一列的和
// 将你认为最好的两个实现,放在最前面。
// 比如:
// {my_c_sum1, "超级垃圾列求和实现"},
// {my_rc_sum2, "好一点的行列求和实现"},
// */
//
// rc_fun_rec rc_fun_tab[] =
// {
//
// /* 第一项,应当是你写的最好列求和的函数实现 */
// {cuda_c_sum, COL, "CUDA optimized column sum"},
// /* 第二项,应当是你写的最好行列求和的函数实现 */
// {cuda_rc_sum, ROWCOL, "CUDA optimized row and column sum"},
//
// {c_sum, COL, "Column sum, reference implementation"},
//
// {rc_sum, ROWCOL, "Row and column sum, reference implementation"},
//
// /* 下面的代码不能修改或者删除!!表明数组列表结束 */
// {NULL,ROWCOL,NULL}
// };

240
perflab/matrix/rowcol.z~ Normal file
View File

@@ -0,0 +1,240 @@
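/**************************************************************************
Row/column sum functions. Edit this file as follows:
1. Put your student ID and name in the comment below;
2. Implement different versions of the row/column sum functions;
3. Edit the rc_fun_rec rc_fun_tab array so that your best answers
(best row-and-column sum, best column sum) are its first two entries
***************************************************************************/
/*
Student ID: 201209054233
Name: 夜半加班狂
*/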
#include <stdio.h>
#include <stdlib.h>
#include "rowcol.h"
#include <math.h>
/*
 * Reference column-sum implementation.
 *
 * For every column index c in [0, N), sets colsum[c] to the sum of M[r][c]
 * over all rows r. The signature is shared with the row-and-column
 * variants; rowsum is not touched here.
 */
void c_sum(matrix_t M, vector_t rowsum, vector_t colsum)
{
    int row, col;

    col = 0;
    while (col < N) {
        colsum[col] = 0;
        /* Walk down the column, adding one element per row. */
        for (row = 0; row < N; ++row) {
            colsum[col] += M[row][col];
        }
        ++col;
    }
}
/*
 * Reference row-and-column-sum implementation.
 *
 * For every index k in [0, N), sets rowsum[k] to the sum of row k and
 * colsum[k] to the sum of column k of M.
 */
void rc_sum(matrix_t M, vector_t rowsum, vector_t colsum)
{
    int idx, step;

    idx = 0;
    while (idx < N) {
        rowsum[idx] = colsum[idx] = 0;
        /* One pass feeds both totals: element `step` of row idx and
           element `step` of column idx. */
        for (step = 0; step < N; ++step) {
            rowsum[idx] += M[idx][step];
            colsum[idx] += M[step][idx];
        }
        ++idx;
    }
}
/*
This table holds one entry per implementation, in the form
(function name, COL/ROWCOL, "description string").
COL means the function computes only the sum of each column.
ROWCOL means the function computes the sum of each row and of each column.
Put the two implementations you consider best at the front.
For example:
{my_c_sum1, "really poor column sum implementation"},
{my_rc_sum2, "somewhat better row and column sum implementation"},
*/
rc_fun_rec rc_fun_tab[] =
{
/* First entry: should be your best column-sum implementation */
{c_sum, COL, "Best column sum"},
/* Second entry: should be your best row-and-column-sum implementation */
{rc_sum, ROWCOL, "Best row and column sum"},
{c_sum, COL, "Column sum, reference implementation"},
{rc_sum, ROWCOL, "Row and column sum, reference implementation"},
/* Do not modify or delete the entry below: it marks the end of the list */
{NULL,ROWCOL,NULL}
};
// /**************************************************************************
// 行/列求和函数。按下面的要求编辑此文件:
// 1. 将你的学号、姓名,以注释的方式写到下面;
// 2. 实现不同版本的行列求和函数;
// 3. 编辑rc_fun_rec rc_fun_tab数组将你的最好的答案
// (最好的行和列求和、最好的列求和)作为数组的前两项
// ***************************************************************************/
//
// /*
// 学号202302723005
// 姓名:程景愉
// */
//
//
// #include <stdio.h>
// #include <stdlib.h>
// #include "rowcol.h"
// #include <math.h>
// #include <cuda_runtime.h>
//
// /* 参考的列求和函数实现 */
// /* 计算矩阵中的每一列的和。请注意对于行和列求和来说,调用参数是
// 一样的只是第2个参数不会用到而已
// */
//
// void c_sum(matrix_t M, vector_t rowsum, vector_t colsum)
// {
// int i,j;
// for (j = 0; j < N; j++) {
// colsum[j] = 0;
// for (i = 0; i < N; i++)
// colsum[j] += M[i][j];
// }
// }
//
//
// /* 参考的列和行求和函数实现 */
// /* 计算矩阵中的每一行、每一列的和。 */
//
// void rc_sum(matrix_t M, vector_t rowsum, vector_t colsum)
// {
// int i,j;
// for (i = 0; i < N; i++) {
// rowsum[i] = colsum[i] = 0;
// for (j = 0; j < N; j++) {
// rowsum[i] += M[i][j];
// colsum[i] += M[j][i];
// }
// }
// }
//
// /* CUDA优化的列求和函数 */
// void cuda_c_sum(matrix_t M, vector_t rowsum, vector_t colsum)
// {
// // 分配设备内存
// int *d_M, *d_colsum;
// cudaMalloc(&d_M, N * N * sizeof(int));
// cudaMalloc(&d_colsum, N * sizeof(int));
//
// // 将数据从主机复制到设备
// cudaMemcpy(d_M, M, N * N * sizeof(int), cudaMemcpyHostToDevice);
//
// // 定义CUDA核函数
// dim3 blockDim(256);
// dim3 gridDim((N + blockDim.x - 1) / blockDim.x);
//
// // 启动核函数
// cudaColumnSum<<<gridDim, blockDim>>>(d_M, d_colsum);
//
// // 将结果从设备复制回主机
// cudaMemcpy(colsum, d_colsum, N * sizeof(int), cudaMemcpyDeviceToHost);
//
// // 释放设备内存
// cudaFree(d_M);
// cudaFree(d_colsum);
// }
//
// /* CUDA优化的行列求和函数 */
// void cuda_rc_sum(matrix_t M, vector_t rowsum, vector_t colsum)
// {
// // 分配设备内存
// int *d_M, *d_rowsum, *d_colsum;
// cudaMalloc(&d_M, N * N * sizeof(int));
// cudaMalloc(&d_rowsum, N * sizeof(int));
// cudaMalloc(&d_colsum, N * sizeof(int));
//
// // 将数据从主机复制到设备
// cudaMemcpy(d_M, M, N * N * sizeof(int), cudaMemcpyHostToDevice);
//
// // 定义CUDA核函数
// dim3 blockDim(256);
// dim3 gridDim((N + blockDim.x - 1) / blockDim.x);
//
// // 启动核函数
// cudaRowColSum<<<gridDim, blockDim>>>(d_M, d_rowsum, d_colsum);
//
// // 将结果从设备复制回主机
// cudaMemcpy(rowsum, d_rowsum, N * sizeof(int), cudaMemcpyDeviceToHost);
// cudaMemcpy(colsum, d_colsum, N * sizeof(int), cudaMemcpyDeviceToHost);
//
// // 释放设备内存
// cudaFree(d_M);
// cudaFree(d_rowsum);
// cudaFree(d_colsum);
// }
//
// /* CUDA核函数 - 列求和 */
// __global__ void cudaColumnSum(int *M, int *colsum)
// {
// int col = blockIdx.x * blockDim.x + threadIdx.x;
// if (col < N) {
// colsum[col] = 0;
// for (int row = 0; row < N; row++) {
// colsum[col] += M[row * N + col];
// }
// }
// }
//
// /* CUDA核函数 - 行列求和 */
// __global__ void cudaRowColSum(int *M, int *rowsum, int *colsum)
// {
// int idx = blockIdx.x * blockDim.x + threadIdx.x;
// if (idx < N) {
// // 计算行和
// rowsum[idx] = 0;
// for (int j = 0; j < N; j++) {
// rowsum[idx] += M[idx * N + j];
// }
//
// // 计算列和
// colsum[idx] = 0;
// for (int i = 0; i < N; i++) {
// colsum[idx] += M[i * N + idx];
// }
// }
// }
//
// /*
// 这个表格包含多个数组元素,每一组元素(函数名字, COL/ROWCOL, "描述字符串"
// COL表示该函数仅仅计算每一列的和
// ROWCOL表示该函数计算每一行、每一列的和
// 将你认为最好的两个实现,放在最前面。
// 比如:
// {my_c_sum1, "超级垃圾列求和实现"},
// {my_rc_sum2, "好一点的行列求和实现"},
// */
//
// rc_fun_rec rc_fun_tab[] =
// {
//
// /* 第一项,应当是你写的最好列求和的函数实现 */
// {cuda_c_sum, COL, "CUDA optimized column sum"},
// /* 第二项,应当是你写的最好行列求和的函数实现 */
// {cuda_rc_sum, ROWCOL, "CUDA optimized row and column sum"},
//
// {c_sum, COL, "Column sum, reference implementation"},
//
// {rc_sum, ROWCOL, "Row and column sum, reference implementation"},
//
// /* 下面的代码不能修改或者删除!!表明数组列表结束 */
// {NULL,ROWCOL,NULL}
// };

View File

@@ -0,0 +1,69 @@
/**************************************************************************
Row/column sum functions. Edit this file as follows:
1. Write your student ID and name in the comment below;
2. Implement different versions of the row/column sum functions;
3. Edit the rc_fun_rec rc_fun_tab array so that your best answers
(best row-and-column sum, best column sum) are its first two entries.
***************************************************************************/
/*
????201209054233
??????????????
*/
#include "rowcol.h"
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
/*
 * Reference column-sum implementation.
 *
 * Writes into colsum[j] the sum of column j of the N x N matrix M.
 * The signature is shared with the row-and-column variant, so rowsum is
 * accepted but never touched here.
 */
void c_sum(matrix_t M, vector_t rowsum, vector_t colsum) {
  int col = 0;
  int row;

  while (col < N) {
    /* Start this column from zero, then walk down its rows. */
    colsum[col] = 0;
    for (row = 0; row < N; row++) {
      colsum[col] += M[row][col];
    }
    col++;
  }
}
/*
 * Reference row-and-column-sum implementation.
 *
 * For every index k of the N x N matrix M, writes the sum of row k into
 * rowsum[k] and the sum of column k into colsum[k].
 */
void rc_sum(matrix_t M, vector_t rowsum, vector_t colsum) {
  int k;
  int t;

  for (k = 0; k < N; k++) {
    colsum[k] = 0;
    rowsum[k] = 0;
    t = 0;
    while (t < N) {
      rowsum[k] += M[k][t]; /* walk along row k */
      colsum[k] += M[t][k]; /* walk down column k */
      t++;
    }
  }
}
/*
This table holds several entries, each of the form
(function, COL/ROWCOL, "description string").
COL means the function only computes the sum of every column.
ROWCOL means the function computes the sum of every row and every column.
Put the two implementations you consider best at the front.
For example:
{my_c_sum1, "really terrible column sum implementation"},
{my_rc_sum2, "somewhat better row and column sum implementation"},
*/
rc_fun_rec rc_fun_tab[] = {
/* First entry: should be your best column-sum implementation */
{c_sum, COL, "Best column sum"},
/* Second entry: should be your best row-and-column-sum implementation */
{rc_sum, ROWCOL, "Best row and column sum"},
{c_sum, COL, "Column sum, reference implementation"},
{rc_sum, ROWCOL, "Row and column sum, reference implementation"},
/* Do not modify or delete the entry below: it marks the end of the table */
{NULL, ROWCOL, NULL}};

Binary file not shown.

View File

@@ -1,9 +1,9 @@
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
//#include <random.h> // #include <random.h>
#include "rowcol.h"
#include "fcyc.h"
#include "clock.h" #include "clock.h"
#include "fcyc.h"
#include "rowcol.h"
#define MAX_ITER_COUNT 100 #define MAX_ITER_COUNT 100
@@ -11,9 +11,9 @@
static struct { static struct {
double cref; /* Cycles taken by reference solution */ double cref; /* Cycles taken by reference solution */
double cbest; /* Cycles taken by our best implementation */ double cbest; /* Cycles taken by our best implementation */
} cstandard[2] = } cstandard[2] = {
{{7.7, 6.40}, /* Column Sum */ {7.7, 6.40}, /* Column Sum */
{9.75, 6.60} /* Row & Column Sum */ {9.75, 6.60} /* Row & Column Sum */
}; };
/* Put in code to align matrix so that it starts on a cache block boundary. /* Put in code to align matrix so that it starts on a cache block boundary.
@@ -26,7 +26,7 @@ static struct {
#define WPB 16 #define WPB 16
int verbose = 1; int verbose = 1;
int data[N*N+WPB]; int data[N * N + WPB];
int *mstart; int *mstart;
typedef vector_t *row_t; typedef vector_t *row_t;
@@ -37,137 +37,122 @@ vector_t rsref, csref, rcomp, ccomp;
static void init_tests(void); static void init_tests(void);
extern void make_CPU_busy(void); extern void make_CPU_busy(void);
static void init_tests(void) static void init_tests(void) {
{ int i, j;
int i, j; size_t bytes_per_block = sizeof(int) * WPB;
size_t bytes_per_block = sizeof(int) * WPB; /* round mstart up to nearest block boundary */
/* round mstart up to nearest block boundary */ mstart = (int *)(((size_t)data + bytes_per_block - 1) / bytes_per_block *
mstart = (int *) bytes_per_block);
(((size_t) data + bytes_per_block-1) / bytes_per_block * bytes_per_block); for (i = 0; i < N; i++) {
for (i = 0; i < N; i++) { rsref[i] = csref[i] = 0;
rsref[i] = csref[i] = 0; }
} for (i = 0; i < N; i++) {
for (i = 0; i < N; i++) { for (j = 0; j < N; j++) {
for (j = 0; j < N; j++) { int val = rand();
int val = rand(); mstart[i * N + j] = val;
mstart[i*N+j] = val; rsref[i] += val;
rsref[i] += val; csref[j] += val;
csref[j] += val;
}
} }
}
} }
/* Test function on all values */ /* Test function on all values */
int test_rc(rc_fun f, FILE *rpt, rc_comp_t rc_type) { int test_rc(rc_fun f, FILE *rpt, rc_comp_t rc_type) {
int i; int i;
int ok = 1; int ok = 1;
for (i = 0; i < N; i++) for (i = 0; i < N; i++)
rcomp[i] = ccomp[i] = 0xDEADBEEF; rcomp[i] = ccomp[i] = 0xDEADBEEF;
f((row_t)mstart, rcomp, ccomp); f((row_t)mstart, rcomp, ccomp);
for (i = 0; ok && i < N; i++) {
if (rc_type == ROWCOL
&& rsref[i] != rcomp[i]) {
ok = 0;
if (rpt)
fprintf(rpt,
"对第%d行的计算出错正确结果是%d但是计算得到%d\n",
i, rsref[i], rcomp[i]);
}
if ((rc_type == ROWCOL || rc_type == COL)
&& csref[i] != ccomp[i]) {
ok = 0;
if (rpt)
fprintf(rpt,
"对第%d列的计算出错正确结果是%d但是计算得到%d\n",
i, csref[i], ccomp[i]);
}
for (i = 0; ok && i < N; i++) {
if (rc_type == ROWCOL && rsref[i] != rcomp[i]) {
ok = 0;
if (rpt)
fprintf(rpt, "对第%d行的计算出错正确结果是%d但是计算得到%d\n", i,
rsref[i], rcomp[i]);
} }
return ok; if ((rc_type == ROWCOL || rc_type == COL) && csref[i] != ccomp[i]) {
ok = 0;
if (rpt)
fprintf(rpt, "对第%d列的计算出错正确结果是%d但是计算得到%d\n", i,
csref[i], ccomp[i]);
}
}
return ok;
} }
/* Kludgy way to interface to cycle measuring code */ /* Kludgy way to interface to cycle measuring code */
void do_test(int *intf) void do_test(int *intf) {
{ rc_fun f = (rc_fun)intf;
rc_fun f = (rc_fun) intf;
f((row_t)mstart, rcomp, ccomp); f((row_t)mstart, rcomp, ccomp);
} }
void time_rc(rc_fun f, rc_comp_t rc_type, char *descr, double *cycp) void time_rc(rc_fun f, rc_comp_t rc_type, char *descr, double *cycp) {
{ int i;
int i; int *intf = (int *)f;
int *intf = (int *) f;
double t, cme; double t, cme;
t = 0; t = 0;
if (verbose) printf("函数:%s\n", descr); if (verbose)
printf("函数:%s\n", descr);
if (test_rc(f, stdout, rc_type)) { if (test_rc(f, stdout, rc_type)) {
make_CPU_busy(); make_CPU_busy();
for (i=0;i<MAX_ITER_COUNT;i++) for (i = 0; i < MAX_ITER_COUNT; i++)
t += fcyc(do_test, intf); t += fcyc((void (*)(long *))do_test, intf);
t = t/MAX_ITER_COUNT; t = t / MAX_ITER_COUNT;
cme = t/(N*N); cme = t / (N * N);
if (verbose) printf(" 总周期数 = %.2f, 平均周期/元素 = %.2f\n", if (verbose)
t, cme); printf(" 总周期数 = %.2f, 平均周期/元素 = %.2f\n", t, cme);
if (cycp) if (cycp)
*cycp = cme; *cycp = cme;
} }
} }
/* Compute the grade achieved by function */ /* Compute the grade achieved by function */
static double compute_score(double cmeas, double cref, double cbest) static double compute_score(double cmeas, double cref, double cbest) {
{ double sbest = cref / cbest;
double sbest = cref/cbest; double smeas = cref / cmeas;
double smeas = cref/cmeas; if (smeas < 0.1 * (sbest - 1) + 1)
if (smeas < 0.1*(sbest-1)+1)
return 0; return 0;
if (smeas > 1.1*(sbest-1)+1) if (smeas > 1.1 * (sbest - 1) + 1)
return 120; return 120;
return 100*((smeas-1.0)/(sbest-1.0) + 0.1); return 100 * ((smeas - 1.0) / (sbest - 1.0) + 0.1);
} }
int main(int argc, char *argv[]) int main(int argc, char *argv[]) {
{
int i; int i;
double cme; double cme;
double cme_c,cme_rc; double cme_c, cme_rc;
int EnableScore=0; int EnableScore = 0;
if (argc == 3) if (argc == 3) {
{ EnableScore = 1;
EnableScore = 1; verbose = 0;
verbose = 0;
} }
init_tests(); init_tests();
set_fcyc_clear_cache(1); /* Set so that clears cache between runs */ set_fcyc_clear_cache(1); /* Set so that clears cache between runs */
for (i = 0; rc_fun_tab[i].f != NULL; i++) { for (i = 0; rc_fun_tab[i].f != NULL; i++) {
cme = 100.0; cme = 100.0;
time_rc(rc_fun_tab[i].f, time_rc(rc_fun_tab[i].f, rc_fun_tab[i].rc_type, rc_fun_tab[i].descr, &cme);
rc_fun_tab[i].rc_type, rc_fun_tab[i].descr, &cme); if (i == 0) {
if (i == 0) cme_c = cme;
{ if (EnableScore == 0) {
cme_c = cme; printf(" 最高\"列求和\"得分 ======================== %.0f\n",
if (EnableScore==0) compute_score(cme, cstandard[0].cref, cstandard[0].cbest));
{ }
printf(" 最高\"列求和\"得分 ======================== %.0f\n", }
compute_score(cme, cstandard[0].cref, cstandard[0].cbest)); if (i == 1) {
} cme_rc = cme;
} if (EnableScore == 0) {
if (i == 1) printf(" 最高\"行和列求和\"得分 ====================== %.0f\n",
{ compute_score(cme, cstandard[1].cref, cstandard[1].cbest));
cme_rc = cme; }
if (EnableScore==0) }
{
printf(" 最高\"行和列求和\"得分 ====================== %.0f\n",
compute_score(cme, cstandard[1].cref, cstandard[1].cbest));
}
}
} }
if (EnableScore) if (EnableScore)
printf("%.2f\t %.0f\t %.2f\t %.0f\t 0\t 0\n",cme_c,compute_score(cme_c, cstandard[0].cref, cstandard[0].cbest), printf("%.2f\t %.0f\t %.2f\t %.0f\t 0\t 0\n", cme_c,
cme_rc,compute_score(cme_rc, cstandard[1].cref, cstandard[1].cbest)); compute_score(cme_c, cstandard[0].cref, cstandard[0].cbest), cme_rc,
compute_score(cme_rc, cstandard[1].cref, cstandard[1].cbest));
return 0; return 0;
} }

Binary file not shown.

35
perflab/poly/Makefile Normal file
View File

@@ -0,0 +1,35 @@
# Build file for the polynomial-evaluation lab (C test driver + CUDA kernels).
CC = gcc
NVCC = nvcc
CFLAGS = -Wall -O2 -g
CUDA_FLAGS = -O2 -g
LDFLAGS = -lm -lcudart

# Source files
SRCS = poly_test.c clock.c cpe.c fcyc.c lsquare.c
CUDA_SRCS = poly.cu
OBJS = $(SRCS:.c=.o) $(CUDA_SRCS:.cu=.o)

# Target executable
TARGET = poly_test

# Default target
all: $(TARGET)

# Rule to build the executable.
# The final link goes through nvcc, not gcc: poly.o is produced by nvcc, so it
# needs the CUDA runtime (and the host C++ runtime) on the link line, and nvcc
# supplies the matching library search paths automatically.
$(TARGET): $(OBJS)
	$(NVCC) $(OBJS) -o $(TARGET) $(LDFLAGS)

# Rule to build object files from C sources
%.o: %.c
	$(CC) $(CFLAGS) -c $< -o $@

# Rule to build CUDA object files
%.o: %.cu
	$(NVCC) $(CUDA_FLAGS) -c $< -o $@

# Clean rule
clean:
	rm -f $(OBJS) $(TARGET)

# Phony targets
.PHONY: all clean

View File

@@ -13,11 +13,11 @@
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
#include <intrin.h> #include <x86intrin.h>
//#include <intrinsics.h> // #include <intrinsics.h>
#include <windows.h>
#include <time.h>
#include "clock.h" #include "clock.h"
#include <time.h>
#include <windows.h>
/* Use x86 cycle counter */ /* Use x86 cycle counter */
@@ -27,203 +27,195 @@ static unsigned cyc_lo = 0;
/* Set *hi and *lo to the high and low order bits of the cycle counter. /* Set *hi and *lo to the high and low order bits of the cycle counter.
Implementation requires assembly code to use the rdtsc instruction. */ Implementation requires assembly code to use the rdtsc instruction. */
void access_counter(unsigned *hi, unsigned *lo) void access_counter(unsigned *hi, unsigned *lo) {
{
long long counter; long long counter;
counter = __rdtsc(); counter = __rdtsc();
(*hi) = (unsigned int)(counter >> 32); (*hi) = (unsigned int)(counter >> 32);
(*lo) = (unsigned int)counter; (*lo) = (unsigned int)counter;
/* /*
LARGE_INTEGER lPerformanceCount; LARGE_INTEGER lPerformanceCount;
QueryPerformanceCounter(&lPerformanceCount); QueryPerformanceCounter(&lPerformanceCount);
(*hi) = (unsigned int)lPerformanceCount.HighPart; (*hi) = (unsigned int)lPerformanceCount.HighPart;
(*lo) = (unsigned int)lPerformanceCount.LowPart; (*lo) = (unsigned int)lPerformanceCount.LowPart;
// printf("%08X %08X\n",(*hi),(*lo)); // printf("%08X %08X\n",(*hi),(*lo));
*/ */
} }
/* Record the current value of the cycle counter. */ /* Record the current value of the cycle counter. */
void start_counter() void start_counter() { access_counter(&cyc_hi, &cyc_lo); }
{
access_counter(&cyc_hi, &cyc_lo);
}
/* Return the number of cycles since the last call to start_counter. */ /* Return the number of cycles since the last call to start_counter. */
double get_counter() double get_counter() {
{ unsigned ncyc_hi, ncyc_lo;
unsigned ncyc_hi, ncyc_lo; unsigned hi, lo, borrow;
unsigned hi, lo, borrow; double result;
double result;
/* Get cycle counter */ /* Get cycle counter */
access_counter(&ncyc_hi, &ncyc_lo); access_counter(&ncyc_hi, &ncyc_lo);
/* Do double precision subtraction */ /* Do double precision subtraction */
lo = ncyc_lo - cyc_lo; lo = ncyc_lo - cyc_lo;
borrow = cyc_lo > ncyc_lo; borrow = cyc_lo > ncyc_lo;
hi = ncyc_hi - cyc_hi - borrow; hi = ncyc_hi - cyc_hi - borrow;
result = (double) hi * (1 << 30) * 4 + lo; result = (double)hi * (1 << 30) * 4 + lo;
return result; return result;
} }
void make_CPU_busy(void) void make_CPU_busy(void) {
{ volatile double old_tick, new_tick;
volatile double old_tick,new_tick; start_counter();
start_counter(); old_tick = get_counter();
old_tick = get_counter(); new_tick = get_counter();
new_tick = get_counter(); while (new_tick - old_tick < 1000000000)
while (new_tick - old_tick < 1000000000) new_tick = get_counter();
new_tick = get_counter();
} }
//CPU的频率 // CPU<EFBFBD><EFBFBD>Ƶ<EFBFBD><EFBFBD>
double mhz(int verbose) double mhz(int verbose) {
{ LARGE_INTEGER lFrequency;
LARGE_INTEGER lFrequency; LARGE_INTEGER lPerformanceCount_Start;
LARGE_INTEGER lPerformanceCount_Start; LARGE_INTEGER lPerformanceCount_End;
LARGE_INTEGER lPerformanceCount_End; double mhz;
double mhz; double fTime;
double fTime; __int64 _i64StartCpuCounter;
__int64 _i64StartCpuCounter; __int64 _i64EndCpuCounter;
__int64 _i64EndCpuCounter; // On a multiprocessor machine, it should not matter which processor is
//On a multiprocessor machine, it should not matter which processor is called. // called. However, you can get different results on different processors due
//However, you can get different results on different processors due to bugs in // to bugs in the BIOS or the HAL. To specify processor affinity for a thread,
//the BIOS or the HAL. To specify processor affinity for a thread, use the SetThreadAffinityMask function. // use the SetThreadAffinityMask function.
HANDLE hThread=GetCurrentThread(); HANDLE hThread = GetCurrentThread();
SetThreadAffinityMask(hThread,0x1); SetThreadAffinityMask(hThread, 0x1);
//主板上高精度定时器的晶振频率 // <20><><EFBFBD><EFBFBD><EFBFBD>ϸ߾<CFB8><DFBE>ȶ<EFBFBD>ʱ<EFBFBD><CAB1><EFBFBD>ľ<EFBFBD><C4BE><EFBFBD>Ƶ<EFBFBD><C6B5>
//这个定时器应该就是一片8253或者8254 // <20><><EFBFBD><EFBFBD><EFBFBD>ʱ<EFBFBD><CAB1>Ӧ<EFBFBD>þ<EFBFBD><C3BE><EFBFBD>һƄ1<C684>78253<35><33><EFBFBD><EFBFBD>8254
//intel ich7中集成了8254 // <20><>intel ich7<EFBFBD>м<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>8254
QueryPerformanceFrequency(&lFrequency); QueryPerformanceFrequency(&lFrequency);
// if (verbose>0) // if (verbose>0)
// printf("高精度定时器的晶振频率:%1.0fHz.\n",(double)lFrequency.QuadPart); // printf("<EFBFBD>߾<EFBFBD><EFBFBD>ȶ<EFBFBD>ʱ<EFBFBD><EFBFBD><EFBFBD>ľ<EFBFBD><EFBFBD><EFBFBD>Ƶ<EFBFBD>ʣ<EFBFBD>%1.0fHz.\n",(double)lFrequency.QuadPart);
//这个定时器每经过一个时钟周期,其计数器会+1 // <20><><EFBFBD><EFBFBD><EFBFBD>ʱ<EFBFBD><CAB1>ÿ<EFBFBD><C3BF><EFBFBD><EFBFBD>һ<EFBFBD><D2BB>ʱ<EFBFBD><CAB1><EFBFBD><EFBFBD><EFBFBD>ڣ<EFBFBD><DAA3><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>+1
QueryPerformanceCounter(&lPerformanceCount_Start); QueryPerformanceCounter(&lPerformanceCount_Start);
//RDTSC指令:获取CPU经历的时钟周期数 // RDTSCָ<EFBFBD><EFBFBD>:<3A><>ȡCPU<50><55><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʱ<EFBFBD><CAB1><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
_i64StartCpuCounter=__rdtsc(); _i64StartCpuCounter = __rdtsc();
//延时长一点,误差会小一点 // <20><>ʱ<EFBFBD><CAB1>һ<EFBFBD><D2BB>,<2C><><EFBFBD><EFBFBD>Сһ<D0A1><D2BB>
//int nTemp=100000; // int nTemp=100000;
//while (--nTemp); // while (--nTemp);
Sleep(200); Sleep(200);
QueryPerformanceCounter(&lPerformanceCount_End); QueryPerformanceCounter(&lPerformanceCount_End);
_i64EndCpuCounter=__rdtsc(); _i64EndCpuCounter = __rdtsc();
//f=1/T => f=计数次数/(计数次数*T) // f=1/T => f=<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>/(<28><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>*T)
//这里的“计数次数*T”就是时间差 // <20><><EFBFBD><EFBFBD>ġ<EFBFBD><C4A1><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ᅣ1<EFBF84>7*T<><54><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʱ<EFBFBD><CAB1>ᅣ1<EFBF84>7
fTime=((double)lPerformanceCount_End.QuadPart-(double)lPerformanceCount_Start.QuadPart) fTime = ((double)lPerformanceCount_End.QuadPart -
/(double)lFrequency.QuadPart; (double)lPerformanceCount_Start.QuadPart) /
(double)lFrequency.QuadPart;
mhz = (_i64EndCpuCounter-_i64StartCpuCounter)/(fTime*1000000.0); mhz = (_i64EndCpuCounter - _i64StartCpuCounter) / (fTime * 1000000.0);
if (verbose>0) if (verbose > 0)
printf("CPU频率为:%1.6fMHz.\n",mhz); printf("CPUƵ<EFBFBD><EFBFBD>Ϊ:%1.6fMHz.\n", mhz);
return mhz; return mhz;
} }
double CPU_Factor1(void) double CPU_Factor1(void) {
{ double result;
double result; int i, j, k, ii, jj, kk;
int i,j,k,ii,jj,kk; LARGE_INTEGER lStart, lEnd;
LARGE_INTEGER lStart,lEnd;
LARGE_INTEGER lFrequency; LARGE_INTEGER lFrequency;
HANDLE hThread; HANDLE hThread;
double fTime; double fTime;
QueryPerformanceFrequency(&lFrequency); QueryPerformanceFrequency(&lFrequency);
ii = 43273; ii = 43273;
kk = 1238; kk = 1238;
result = 1; result = 1;
jj = 1244; jj = 1244;
hThread=GetCurrentThread(); hThread = GetCurrentThread();
SetThreadAffinityMask(hThread,0x1); SetThreadAffinityMask(hThread, 0x1);
QueryPerformanceCounter(&lStart); QueryPerformanceCounter(&lStart);
//_asm("cpuid"); //_asm("cpuid");
start_counter(); start_counter();
for (i=0;i<100;i++) for (i = 0; i < 100; i++)
for (j=0;j<1000;j++) for (j = 0; j < 1000; j++)
for (k=0;k<1000;k++) for (k = 0; k < 1000; k++)
kk += kk*ii+jj; kk += kk * ii + jj;
result = get_counter(); result = get_counter();
QueryPerformanceCounter(&lEnd); QueryPerformanceCounter(&lEnd);
fTime=((double)lEnd.QuadPart-(double)lStart.QuadPart); fTime = ((double)lEnd.QuadPart - (double)lStart.QuadPart);
printf("CPU运行时间为%f",result); printf("CPU<EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʱ<EFBFBD><EFBFBD>Ϊ%f", result);
printf("\t %f\n",fTime); printf("\t %f\n", fTime);
return result; return result;
} }
double CPU_Factor(void) double CPU_Factor(void) {
{ double frequency;
double frequency; double multiplier = 1000 * 1000 * 1000; // nano
double multiplier = 1000 * 1000 * 1000;//nano LARGE_INTEGER lFrequency;
LARGE_INTEGER lFrequency; LARGE_INTEGER start, stop;
LARGE_INTEGER start,stop; HANDLE hThread;
HANDLE hThread; int i;
int i; const int gigahertz = 1000 * 1000 * 1000;
const int gigahertz= 1000*1000*1000; const int known_instructions_per_loop = 27317;
const int known_instructions_per_loop = 27317;
int iterations = 100000000; int iterations = 100000000;
int g = 0; int g = 0;
double normal_ticks_per_second; double normal_ticks_per_second;
double ticks; double ticks;
double time; double time;
double loops_per_sec; double loops_per_sec;
double instructions_per_loop; double instructions_per_loop;
double ratio; double ratio;
double actual_freq; double actual_freq;
QueryPerformanceFrequency(&lFrequency); QueryPerformanceFrequency(&lFrequency);
frequency = (double)lFrequency.QuadPart; frequency = (double)lFrequency.QuadPart;
hThread=GetCurrentThread(); hThread = GetCurrentThread();
SetThreadAffinityMask(hThread,0x1); SetThreadAffinityMask(hThread, 0x1);
QueryPerformanceCounter(&start); QueryPerformanceCounter(&start);
for( i = 0; i < iterations; i++) for (i = 0; i < iterations; i++) {
{ g++;
g++; g++;
g++; g++;
g++; g++;
g++; }
} QueryPerformanceCounter(&stop);
QueryPerformanceCounter(&stop);
//normal ticks differs from the WMI data, i.e 3125, when WMI 3201, and CPUZ 3199 // normal ticks differs from the WMI data, i.e 3125, when WMI 3201, and CPUZ
normal_ticks_per_second = frequency * 1000; // 3199
ticks = (double)((double)stop.QuadPart - (double)start.QuadPart); normal_ticks_per_second = frequency * 1000;
time = (ticks * multiplier) /frequency; ticks = (double)((double)stop.QuadPart - (double)start.QuadPart);
loops_per_sec = iterations / (time/multiplier); time = (ticks * multiplier) / frequency;
instructions_per_loop = normal_ticks_per_second / loops_per_sec; loops_per_sec = iterations / (time / multiplier);
instructions_per_loop = normal_ticks_per_second / loops_per_sec;
ratio = (instructions_per_loop / known_instructions_per_loop); ratio = (instructions_per_loop / known_instructions_per_loop);
actual_freq = normal_ticks_per_second / ratio; actual_freq = normal_ticks_per_second / ratio;
/* /*
actual_freq = normal_ticks_per_second / ratio; actual_freq = normal_ticks_per_second / ratio;
actual_freq = known_instructions_per_loop*iterations*multiplier/time; actual_freq = known_instructions_per_loop*iterations*multiplier/time;
2293 = x/time; 2293 = x/time;
2292.599713*1191533038.809362=known_instructions_per_loop*100000000*1000 2292.599713*1191533038.809362=known_instructions_per_loop*100000000*1000
loops_per_sec = iterations*frequency / ticks loops_per_sec = iterations*frequency / ticks
instructions_per_loop = / loops_per_sec; instructions_per_loop = / loops_per_sec;
*/ */
printf("Perf counter freq: %f\n", normal_ticks_per_second); printf("Perf counter freq: %f\n", normal_ticks_per_second);
printf("Loops per sec: %f\n", loops_per_sec); printf("Loops per sec: %f\n", loops_per_sec);
printf("Perf counter freq div loops per sec: %f\n", instructions_per_loop); printf("Perf counter freq div loops per sec: %f\n", instructions_per_loop);
printf("Presumed freq: %f\n", actual_freq); printf("Presumed freq: %f\n", actual_freq);
printf("ratio: %f\n", ratio); printf("ratio: %f\n", ratio);
printf("time=%f\n",time); printf("time=%f\n", time);
return ratio; return ratio;
} }

325
perflab/poly/poly.cu Normal file
View File

@@ -0,0 +1,325 @@
/**************************************************************************
多项式计算函数。按下面的要求编辑此文件:
1. 将你的学号、姓名,以注释的方式写到下面;
2. 实现不同版本的多项式计算函数;
3. 编辑peval_fun_rec peval_fun_tab数组将你的最好的答案
最小CPE、最小C10作为数组的前两项
***************************************************************************/
/*
学号201209054233
姓名:夜半加班狂
*/
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
/* Signature shared by the polynomial evaluators in this file; the evaluators
   below take (coefficient array, degree, x) and return the computed value. */
typedef int (*peval_fun)(int*, int, int);
/* One entry of the evaluator table: the function plus a printable
   description of it. */
typedef struct {
peval_fun f;
char *descr;
} peval_fun_rec, *peval_fun_ptr;
/**************************************************************************
Edit this comment to indicate your name and Andrew ID
#ifdef ASSIGN
Submission by Harry Q. Bovik, bovik@andrew.cmu.edu
#else
Instructor's version.
Created by Randal E. Bryant, Randy.Bryant@cs.cmu.edu, 10/07/02
#endif
***************************************************************************/
/*
 * Evaluate the fixed cubic  21 + 90*x + 42*x^2 + 88*x^3.
 *
 * The four coefficients are hard-coded. The original assignment note says to
 * run the program once first to learn which constants have to be implemented.
 * The first two parameters exist only so the function has the same signature
 * as the general evaluators; they are ignored.
 *
 * Multiplications by the constants are replaced by shifts and adds:
 *   90 = 64 + 32 - 4 - 2
 *   42 = 32 + 8 + 2
 *   88 = 64 + 16 + 8
 * and the powers of x are folded in Horner style.
 *
 * All arithmetic is done on unsigned values: left-shifting a negative int and
 * signed overflow are undefined behaviour, whereas unsigned arithmetic wraps
 * modulo 2^32 and yields the same bit pattern.
 *
 * C linkage is requested when this file is compiled as C++ so that C
 * translation units can call the function by its plain name.
 */
#ifdef __cplusplus
extern "C"
#endif
int const_poly_eval(int *not_use, int not_use2, int x)
{
    const unsigned ux = (unsigned)x;
    const unsigned x64 = ux << 6;
    const unsigned x32 = ux << 5;
    const unsigned x16 = ux << 4;
    const unsigned x8 = ux << 3;
    const unsigned x4 = ux << 2;
    const unsigned x2 = ux << 1;

    const unsigned t90 = x64 + x32 - x4 - x2; /* 90 * x */
    const unsigned t42 = x32 + x8 + x2;       /* 42 * x */
    const unsigned t88 = x64 + x16 + x8;      /* 88 * x */

    (void)not_use;
    (void)not_use2;

    /* 21 + 90x + (42x + 88x*x)*x */
    return (int)(21u + t90 + (t42 + t88 * ux) * ux);
}
/* Polynomial evaluation. Note: this is only a reference implementation; you
   are expected to write your own versions. */
/*
Hint: lcc supports AT&T-style inline assembly, for example
_asm("movl %eax,%ebx");
_asm("pushl %edx");
In lcc, enabling project->configuration->Compiler->Code Generation->Generate .asm
produces the assembly for the program in the lcc directory. Reading that file
shows how the compiler implemented your code; some of it may be very
inefficient. Inline assembly in the right places can improve performance
considerably.
*/
/*
 * Returns a[0] + a[1]*x + ... + a[degree]*x^degree.
 * A negative degree evaluates no terms and returns 0.
 */
int poly_eval(int *a, int degree, int x)
{
    int sum = 0;
    int power = 1; /* current power of x, starting at x^0 */
    int k = 0;

    while (k <= degree) {
        sum += a[k] * power;
        power *= x;
        k++;
    }
    return sum;
}
/* Forward declaration: the kernel is defined further down in this file, and
   C++ requires a declaration to be visible before the launch below. */
__global__ void cudaPolyEvalLowCPE(int *a, int degree, int x, int *results);

/*
 * CUDA polynomial evaluation, "low CPE" variant.
 *
 * Copies the degree+1 coefficients to the device, launches
 * cudaPolyEvalLowCPE with one thread per term so that every term lands in a
 * per-term results buffer, copies that buffer back, and adds the terms up on
 * the host.
 *
 *   a       coefficient array, a[0..degree]
 *   degree  polynomial degree; a negative degree yields 0
 *   x       evaluation point
 *
 * Returns the polynomial value. If a CUDA call or the host allocation fails,
 * a message is printed and 0 is returned (callers cannot tell that apart from
 * a genuine result of 0).
 */
int cuda_poly_eval_low_cpe(int *a, int degree, int x)
{
    const unsigned int threads_per_block = 256;
    int *d_a = NULL;       /* device copy of the coefficients */
    int *d_results = NULL; /* device buffer, one term per element */
    int *h_results = NULL; /* host copy of d_results */
    int result = 0;
    cudaError_t err;

    if (degree < 0) {
        return 0;
    }

    /* Everything with an initializer is declared before the first goto so no
       jump bypasses an initialization (ill-formed in C++). */
    const size_t count = (size_t)degree + 1;
    const size_t bytes = count * sizeof(int);
    const unsigned int blocks =
        (unsigned int)((count + threads_per_block - 1) / threads_per_block);

    err = cudaMalloc(&d_a, bytes);
    if (err != cudaSuccess) {
        goto cuda_fail;
    }

    err = cudaMalloc(&d_results, bytes);
    if (err != cudaSuccess) {
        goto cuda_fail;
    }

    /* Host -> device: coefficients */
    err = cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice);
    if (err != cudaSuccess) {
        goto cuda_fail;
    }

    cudaPolyEvalLowCPE<<<blocks, threads_per_block>>>(d_a, degree, x, d_results);

    /* Catches launch-time errors such as an invalid configuration. */
    err = cudaGetLastError();
    if (err != cudaSuccess) {
        goto cuda_fail;
    }

    /* The cast is required because this file is compiled as C++. */
    h_results = (int *)malloc(bytes);
    if (h_results == NULL) {
        printf("Memory allocation error\n");
        goto cleanup;
    }

    /* Device -> host: per-term results */
    err = cudaMemcpy(h_results, d_results, bytes, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) {
        goto cuda_fail;
    }

    /* Final reduction on the host */
    for (size_t i = 0; i < count; i++) {
        result += h_results[i];
    }
    goto cleanup;

cuda_fail:
    printf("CUDA Error: %s\n", cudaGetErrorString(err));

cleanup:
    /* free(NULL) and cudaFree(NULL) are no-ops, so one exit path serves every
       stage of failure. */
    free(h_results);
    cudaFree(d_results);
    cudaFree(d_a);
    return result;
}
/* Forward declaration: the kernel is defined further down in this file, and
   C++ requires a declaration to be visible before the launch below. */
__global__ void cudaPolyEvalDegree10(int *a, int degree, int x, int *result);

/*
 * CUDA polynomial evaluation, variant aimed at degree-10 polynomials.
 *
 * Copies the degree+1 coefficients to the device, launches
 * cudaPolyEvalDegree10 as a single block of 256 threads which reduces all
 * terms into one device-side int, and copies that int back.
 *
 *   a       coefficient array, a[0..degree]
 *   degree  polynomial degree; a negative degree yields 0
 *   x       evaluation point
 *
 * Returns the polynomial value. If a CUDA call fails, a message is printed
 * and 0 is returned (callers cannot tell that apart from a genuine 0).
 *
 * NOTE(review): only one 256-thread block is launched, so correctness for
 * degree >= 256 depends on the kernel handling more than one term per
 * thread - confirm against the kernel.
 */
int cuda_poly_eval_degree10(int *a, int degree, int x)
{
    const unsigned int threads_per_block = 256;
    int *d_a = NULL;      /* device copy of the coefficients */
    int *d_result = NULL; /* device-side scalar result */
    int result = 0;
    cudaError_t err;

    if (degree < 0) {
        return 0;
    }

    /* Declared before the first goto so no jump bypasses an initialization. */
    const size_t bytes = ((size_t)degree + 1) * sizeof(int);

    err = cudaMalloc(&d_a, bytes);
    if (err != cudaSuccess) {
        goto cuda_fail;
    }

    err = cudaMalloc(&d_result, sizeof(int));
    if (err != cudaSuccess) {
        goto cuda_fail;
    }

    /* Host -> device: coefficients */
    err = cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice);
    if (err != cudaSuccess) {
        goto cuda_fail;
    }

    /* One block is enough: the kernel produces a single scalar. */
    cudaPolyEvalDegree10<<<1, threads_per_block>>>(d_a, degree, x, d_result);

    /* Catches launch-time errors such as an invalid configuration. */
    err = cudaGetLastError();
    if (err != cudaSuccess) {
        goto cuda_fail;
    }

    /* Device -> host: the scalar result */
    err = cudaMemcpy(&result, d_result, sizeof(int), cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) {
        goto cuda_fail;
    }
    goto cleanup;

cuda_fail:
    printf("CUDA Error: %s\n", cudaGetErrorString(err));
    result = 0;

cleanup:
    /* cudaFree(NULL) is a no-op, so one exit path serves every failure. */
    cudaFree(d_result);
    cudaFree(d_a);
    return result;
}
/*
 * CUDA kernel, "low CPE" variant: each thread computes one polynomial term.
 *
 * The thread whose global index is `term` writes a[term] * x^term into
 * results[term]; threads whose index exceeds `degree` do nothing. Summing
 * the results array is left to the caller.
 */
__global__ void cudaPolyEvalLowCPE(int *a, int degree, int x, int *results)
{
    int term = blockIdx.x * blockDim.x + threadIdx.x;

    if (term > degree) {
        return;
    }

    /* Raise x to the power `term` by repeated multiplication. */
    int power = 1;
    for (int left = term; left > 0; --left) {
        power *= x;
    }

    results[term] = a[term] * power;
}
/*
 * CUDA kernel, variant aimed at degree-10 polynomials.
 *
 * Terms are indexed by threadIdx.x only, so the kernel is meant to be
 * launched as a single block. Each thread accumulates the terms
 * threadIdx.x, threadIdx.x + blockDim.x, ... up to `degree`, adds its partial
 * sum into a __shared__ accumulator with atomicAdd, and thread 0 finally
 * stores the total in *result.
 *
 * Striding over the terms keeps the result correct when degree >= blockDim.x
 * (one term per thread would silently drop the higher terms). The signed
 * loop condition also means a negative degree reads nothing from `a`; a
 * direct `threadIdx.x <= degree` comparison is performed in unsigned
 * arithmetic and would be true for every thread in that case.
 */
__global__ void cudaPolyEvalDegree10(int *a, int degree, int x, int *result)
{
    __shared__ int block_sum;

    /* One thread zeroes the accumulator before anyone adds to it. */
    if (threadIdx.x == 0) {
        block_sum = 0;
    }
    __syncthreads();

    int local_sum = 0;
    for (int term = (int)threadIdx.x; term <= degree; term += (int)blockDim.x) {
        /* x^term by repeated multiplication */
        int xpwr = 1;
        for (int i = 0; i < term; i++) {
            xpwr *= x;
        }
        local_sum += a[term] * xpwr;
    }

    atomicAdd(&block_sum, local_sum);

    /* Wait until every thread has contributed before publishing. */
    __syncthreads();

    if (threadIdx.x == 0) {
        *result = block_sum;
    }
}
/*
This table holds several entries, each of the form
(function, "description string").
Put the two implementations you consider best at the front.
For example:
{my_poly_eval1, "really terrible implementation"},
{my_poly_eval2, "somewhat better implementation"},
*/
peval_fun_rec peval_fun_tab[] =
{
/* First entry: should be your implementation with the best CPE */
{cuda_poly_eval_low_cpe, "CUDA optimized low CPE implementation"},
/* Second entry: should be your implementation with the best performance at degree 10 */
{cuda_poly_eval_degree10, "CUDA optimized degree 10 implementation"},
{poly_eval, "poly_eval: 参考实现"},
/* Do not modify or delete the entry below: it marks the end of the table */
{NULL, ""}
};

BIN
perflab/poly/poly.o Normal file

Binary file not shown.

View File

@@ -6,6 +6,7 @@
#include "poly.h" #include "poly.h"
#include "cpe.h" #include "cpe.h"
#include "clock.h" #include "clock.h"
#include <time.h>
double CPU_Mhz; double CPU_Mhz;
@@ -17,7 +18,7 @@ static int coeff[MAXDEGREE+1];
#define MAX_ITER_COUNT 100 #define MAX_ITER_COUNT 100
#define REF_CPU_MHZ 2292.6 // 这是我的处理器主频 #define REF_CPU_MHZ 2292.6 // <EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ҵĴ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ƶ
/* Define performance standards */ /* Define performance standards */
static struct { static struct {
@@ -26,7 +27,7 @@ static struct {
} cstandard[3] = } cstandard[3] =
{{4.00, 1.75}, /* CPE */ {{4.00, 1.75}, /* CPE */
{50, 43}, /* C(10) */ {50, 43}, /* C(10) */
{57,31} /* 常系数多项式计算 */ {57,31} /* <EFBFBD><EFBFBD>ϵ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʽ<EFBFBD><EFBFBD><EFBFBD><EFBFBD> */
}; };
int coeff_const[4]; int coeff_const[4];
@@ -82,7 +83,7 @@ static void init_const_poly(void)
coeff_const[i] = rand_div+10; coeff_const[i] = rand_div+10;
} }
printf("你需要修改poly.cconst_poly_eval函数,实现下面的常数多项式计算!\n"); printf("<EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ҫ<EFBFBD>޸<EFBFBD>poly.c<EFBFBD><EFBFBD>const_poly_eval<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʵ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ij<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʽ<EFBFBD><EFBFBD><EFBFBD>\n");
printf("\tresult=%d+%d*x+%d*x^2+%d*x^3\n",coeff_const[0],coeff_const[1],coeff_const[2],coeff_const[3]); printf("\tresult=%d+%d*x+%d*x^2+%d*x^3\n",coeff_const[0],coeff_const[1],coeff_const[2],coeff_const[3]);
fixval_const = ref_poly_eval(coeff_const, 3, xval); fixval_const = ref_poly_eval(coeff_const, 3, xval);
@@ -97,15 +98,15 @@ void test_const_poly(void)
int my_cal = const_poly_eval(coeff_const, 3, xval); int my_cal = const_poly_eval(coeff_const, 3, xval);
if (fixval_const != my_cal) if (fixval_const != my_cal)
{ {
printf("常系数多项式计算const_poly_eval实现错误x=%d预期结果是%d但是计算得到的是%d\n",xval,fixval_const,my_cal); printf("<EFBFBD><EFBFBD>ϵ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʽ<EFBFBD><EFBFBD><EFBFBD><EFBFBD>const_poly_evalʵ<EFBFBD>ִ<EFBFBD><EFBFBD><EFBFBD>x=%d<><64><EFBFBD><EFBFBD>Ԥ<EFBFBD>ڽ<EFBFBD><DABD><EFBFBD><EFBFBD>%d<><64><EFBFBD><EFBFBD><EFBFBD>Ǽ<EFBFBD><C7BC><EFBFBD>õ<EFBFBD><C3B5><EFBFBD><EFBFBD><EFBFBD>%d\n",xval,fixval_const,my_cal);
exit(0); exit(0);
} }
fix_time = 0; fix_time = 0;
for (i=0;i<MAX_ITER_COUNT;i++) for (i=0;i<MAX_ITER_COUNT;i++)
fix_time += measure_function(run_fun_const, 3); fix_time += measure_function(run_fun_const, 3);
fix_time = fix_time / MAX_ITER_COUNT; fix_time = fix_time / MAX_ITER_COUNT;
printf(" 常系数多项式计算时间 = %.1f\n", fix_time); printf(" <EFBFBD><EFBFBD>ϵ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʽ<EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʱ<EFBFBD><EFBFBD> = %.1f\n", fix_time);
printf(" 最高的常系数多项式计算得分 ============== %.0f\n", printf(" <EFBFBD><EFBFBD>ߵij<EFBFBD>ϵ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʽ<EFBFBD><EFBFBD><EFBFBD><EFBFBD>÷<EFBFBD> ============== %.0f\n",
compute_score(fix_time, cstandard[2].cref, cstandard[2].cbest)); compute_score(fix_time, cstandard[2].cref, cstandard[2].cbest));
} }
@@ -132,7 +133,7 @@ int test_poly(peval_fun f, FILE *rpt) {
ok = 0; ok = 0;
if (rpt) { if (rpt) {
fprintf(rpt, fprintf(rpt,
"错误!多项式计算不对!阶=%d时计算的值是%d而正确值是%d\n", "<EFBFBD><EFBFBD><EFBFBD>󣡶<EFBFBD><EFBFBD><EFBFBD>ʽ<EFBFBD><EFBFBD><EFBFBD><EFBFBD>ԣ<EFBFBD><EFBFBD><EFBFBD>=%dʱ<64><CAB1><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ֵ<EFBFBD><D6B5>%d<><64><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ȷֵ<C8B7><D6B5>%d\n",
MAXDEGREE-i, v, pval[i]); MAXDEGREE-i, v, pval[i]);
} }
} }
@@ -142,7 +143,7 @@ int test_poly(peval_fun f, FILE *rpt) {
ok = 0; ok = 0;
if (rpt) { if (rpt) {
fprintf(rpt, fprintf(rpt,
"错误!多项式计算不对!阶=%d时计算的值是%d而正确值是%d\n", "<EFBFBD><EFBFBD><EFBFBD>󣡶<EFBFBD><EFBFBD><EFBFBD>ʽ<EFBFBD><EFBFBD><EFBFBD><EFBFBD>ԣ<EFBFBD><EFBFBD><EFBFBD>=%dʱ<64><CAB1><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ֵ<EFBFBD><D6B5>%d<><64><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ȷֵ<C8B7><D6B5>%d\n",
FIXDEGREE, v, fixval); FIXDEGREE, v, fixval);
} }
} }
@@ -175,7 +176,7 @@ void run_poly(peval_fun f, char *descr, double *cpep, double *cfixp)
double cpe=0; double cpe=0;
double fix_time=0; double fix_time=0;
pfun = f; pfun = f;
printf("函数:%s\n", descr); printf("<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>%s\n", descr);
if (test_poly(f, stdout)) { if (test_poly(f, stdout)) {
cpe = 0; cpe = 0;
for (i=0;i<MAX_ITER_COUNT;i++) for (i=0;i<MAX_ITER_COUNT;i++)
@@ -206,7 +207,7 @@ static double compute_score(double cmeas, double cref, double cbest)
return 100*((smeas-1.0)/(sbest-1.0) + 0.1); return 100*((smeas-1.0)/(sbest-1.0) + 0.1);
} }
/* 产生一个0~divv-1之间的随机数,同时更新随机数种子 */ /* 产生一个0~divv-1之间的随机数,同时更新随机数种子 */
void GenerateRandomNumber(unsigned long divv) void GenerateRandomNumber(unsigned long divv)
{ {
unsigned long long x = rand1_h; unsigned long long x = rand1_h;
@@ -230,18 +231,18 @@ int main(int argc, char *argv[])
// CPU_Factor(); // CPU_Factor();
// GetCpuClock(); // GetCpuClock();
printf("\t2015多项式优化实验,欢迎你!\n"); printf("\t2015<EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʽ<EFBFBD>Ż<EFBFBD>ʵ<EFBFBD><EFBFBD><EFBFBD>ӭ<EFBFBD>\n");
printf("============================\n"); printf("============================\n");
if (argc == 1) if (argc == 1)
{ {
printf("使用方法:%s 学号后6位 [学号后6位] [学号后6位] ...\n",argv[0]); printf("ʹ<EFBFBD>÷<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>%s ѧ<>ź<EFBFBD>6λ [ѧ<>ź<EFBFBD>6λ] [ѧ<>ź<EFBFBD>] ...\n",argv[0]);
printf("你需要依据提示改写poly.c程序实现一个常系数多项式的计算尽可能快哦....\n"); printf("<EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ҫ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʾ<EFBFBD><EFBFBD>дpoly.c<><63><EFBFBD><EFBFBD>ʵ<EFBFBD><CAB5>һ<EFBFBD><D2BB><EFBFBD><EFBFBD>ϵ<EFBFBD><CFB5><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʽ<EFBFBD>ļ<EFBFBD><C4BC><EFBFBD><E3A3AC><EFBFBD><EFBFBD><EFBFBD>ܿ<EFBFBD>Ŷ....\n");
printf("另外你需要改写poly.c程序实现任意阶的多项式计算和10阶的多项式计算要快\n"); printf("<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ҫ<EFBFBD><EFBFBD>дpoly.c<><63><EFBFBD><EFBFBD>ʵ<EFBFBD><CAB5><EFBFBD><EFBFBD><EFBFBD><EFBFBD>׵Ķ<D7B5><C4B6><EFBFBD>ʽ<EFBFBD><CABD><EFBFBD><EFBFBD><EFBFBD>10<31>׵Ķ<D7B5><C4B6><EFBFBD>ʽ<EFBFBD><CABD><EFBFBD>㣬Ҫ<E3A3AC>\n");
return 0; return 0;
} }
/*依据学号,初始化一个随机数发生器*/ /*依据学号,初始化一个随机数发生器*/
rand1_h = (unsigned long)atoi(argv[1]); rand1_h = (unsigned long)atoi(argv[1]);
rand1_l=0x29A; rand1_l=0x29A;
GenerateRandomNumber(0); GenerateRandomNumber(0);
@@ -266,10 +267,10 @@ int main(int argc, char *argv[])
//make_CPU_busy(); //make_CPU_busy();
run_poly(peval_fun_tab[i].f, peval_fun_tab[i].descr, &cpe, &cfix); run_poly(peval_fun_tab[i].f, peval_fun_tab[i].descr, &cpe, &cfix);
if (i == 0) if (i == 0)
printf(" 最高的CPE得分 =========================== %.0f\n", printf(" <EFBFBD><EFBFBD>ߵ<EFBFBD>CPE<EFBFBD>÷<EFBFBD> =========================== %.0f\n",
compute_score(cpe, cstandard[0].cref, cstandard[0].cbest)); compute_score(cpe, cstandard[0].cref, cstandard[0].cbest));
if (i == 1) if (i == 1)
printf(" 最高的C(10)得分 ========================= %.0f\n", printf(" <EFBFBD><EFBFBD>ߵ<EFBFBD>C(10)<EFBFBD>÷<EFBFBD> ========================= %.0f\n",
compute_score(cfix, cstandard[1].cref, cstandard[1].cbest)); compute_score(cfix, cstandard[1].cref, cstandard[1].cbest));
} }
return 0; return 0;

BIN
perflab/poly/poly_test.o Normal file

Binary file not shown.