perflab added

2025-04-12 10:18:45 +08:00
parent a04c35be04
commit 4ea99d81a7
51 changed files with 350295 additions and 0 deletions
--- a/perflab/matrix/clock.c
+++ b/perflab/matrix/clock.c
@@ -0,0 +1,229 @@
+/* clock.c
+ * Retrofitted to use thread-specific timers
+ * and to get clock information from /proc/cpuinfo
+ * (C) R. E. Bryant, 2010
+ *
+ */
+
+/* When this constant is not defined, uses time stamp counter */
+#define USE_POSIX 0
+
+/* Choice to use cpu_gettime call or Intel time stamp counter directly */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <intrin.h>
+//#include <intrinsics.h>
+#include <windows.h>
+#include <time.h>
+#include "clock.h"
+
+/* Use x86 cycle counter */
+
+/* Initialize the cycle counter */
+static unsigned cyc_hi = 0;
+static unsigned cyc_lo = 0;
+
+/* Set *hi and *lo to the high and low order 32 bits of the cycle counter.
+   The counter is read with the __rdtsc() compiler intrinsic. */
+void access_counter(unsigned *hi, unsigned *lo)
+{
+	/*
+	 * Read the processor time stamp counter with the __rdtsc() compiler
+	 * intrinsic and split the 64-bit value into two 32-bit halves.
+	 *
+	 * hi: receives bits 63..32 of the counter.
+	 * lo: receives bits 31..0 of the counter.
+	 *
+	 * The value is held in an unsigned 64-bit variable so that the right
+	 * shift below is a plain logical shift even once bit 63 is set.
+	 */
+	unsigned long long counter;
+
+	counter = __rdtsc();
+	(*hi) = (unsigned int)(counter >> 32);
+	(*lo) = (unsigned int)counter;
+}
+
+
+/* Record the current value of the cycle counter. */
+void start_counter(void)
+{
+    access_counter(&cyc_hi, &cyc_lo);	/* latch the current counter value in cyc_hi:cyc_lo */
+}
+
+/* Return the number of cycles since the last call to start_counter. */
+double get_counter(void)
+{
+    /* Cycles elapsed since the last start_counter(), computed as a
+       two-word (hi:lo) difference and returned as a double */
+    unsigned now_hi, now_lo;
+    unsigned diff_hi, diff_lo, borrow;
+    /* Get cycle counter */
+    access_counter(&now_hi, &now_lo);
+
+    /* Subtract word by word; a low word that went backwards means a
+       borrow has to be taken from the high word */
+    diff_lo = now_lo - cyc_lo;
+    borrow = cyc_lo > now_lo;
+    diff_hi = now_hi - cyc_hi - borrow;
+    return (double) diff_hi * 4294967296.0 + diff_lo;
+}
+void make_CPU_busy(void)
+{
+	/* Spin until 1e9 counter ticks separate the first and latest reading */
+	volatile double first_tick, latest_tick;
+	start_counter();
+	first_tick = get_counter();
+	for (latest_tick = get_counter(); latest_tick - first_tick < 1000000000; )
+		latest_tick = get_counter();
+}
+
+/* Estimate the CPU clock frequency in MHz; prints it when verbose > 0 */
+double mhz(int verbose)
+{
+    LARGE_INTEGER lFrequency;
+    LARGE_INTEGER lPerformanceCount_Start;
+    LARGE_INTEGER lPerformanceCount_End;
+	double mhz;
+	double fTime;
+	__int64 _i64StartCpuCounter;
+	__int64 _i64EndCpuCounter;
+    //On a multiprocessor machine, it should not matter which processor is called.
+    //However, you can get different results on different processors due to bugs in
+    //the BIOS or the HAL. To specify processor affinity for a thread, use the SetThreadAffinityMask function.
+    HANDLE hThread=GetCurrentThread();
+    SetThreadAffinityMask(hThread,0x1);
+
+    //Crystal frequency of the high-precision timer on the motherboard
+    //This timer is presumably an 8253 or 8254 chip
+    //(an 8254 is integrated in the Intel ICH7)
+    QueryPerformanceFrequency(&lFrequency);
+//    if (verbose>0)
+//    	printf("高精度定时器的晶振频率：%1.0fHz.\n",(double)lFrequency.QuadPart);
+
+    //On every clock period of this timer its counter is incremented by 1
+    QueryPerformanceCounter(&lPerformanceCount_Start);
+
+    //RDTSC instruction: reads the number of clock cycles the CPU has gone through
+    _i64StartCpuCounter=__rdtsc();
+
+    //A longer delay gives a smaller error
+    //int nTemp=100000;
+    //while (--nTemp);
+    Sleep(200);
+
+    QueryPerformanceCounter(&lPerformanceCount_End);
+
+    _i64EndCpuCounter=__rdtsc();
+
+    //f=1/T => f=count/(count*T)
+    //Here "count*T" is the elapsed time in seconds
+    fTime=((double)lPerformanceCount_End.QuadPart-(double)lPerformanceCount_Start.QuadPart)
+        /(double)lFrequency.QuadPart;
+
+ 		mhz = (_i64EndCpuCounter-_i64StartCpuCounter)/(fTime*1000000.0);
+    if (verbose>0)
+    	printf("CPU频率为:%1.6fMHz.\n",mhz);
+    return mhz;
+}
+
+double CPU_Factor1(void)
+{
+	/*
+	 * Time 100*1000*1000 multiply-add steps with both the cycle counter
+	 * and the performance counter, print the two readings and return
+	 * the cycle count.
+	 *
+	 * The operands are unsigned because the running value exceeds the
+	 * range of int within two steps: unsigned arithmetic wraps around,
+	 * whereas signed overflow is undefined behavior.
+	 */
+	double result;
+	int i,j,k;
+	unsigned int ii = 43273, jj = 1244, kk = 1238;
+	LARGE_INTEGER lStart,lEnd;
+	double fTime;
+
+	/* Pin the thread to the first processor for the measurement */
+	SetThreadAffinityMask(GetCurrentThread(),0x1);
+	QueryPerformanceCounter(&lStart);
+	start_counter();
+	for (i=0;i<100;i++)
+		for (j=0;j<1000;j++)
+			for (k=0;k<1000;k++)
+				kk += kk*ii+jj;
+
+	result = get_counter();
+	QueryPerformanceCounter(&lEnd);
+	fTime=((double)lEnd.QuadPart-(double)lStart.QuadPart);
+	printf("CPU运行时间为%f",result);
+	printf("\t %f\n",fTime);
+	return result;
+}
+
+double CPU_Factor(void)
+{
+ /*
+  * Time a fixed counting loop with the performance counter, print a
+  * set of figures derived from the elapsed time and return the one
+  * labelled "ratio".
+  *
+  * With F the performance counter frequency in ticks per second:
+  *   elapsed_ns    = elapsed time in nanoseconds
+  *   loops_per_sec = loop iterations completed per second
+  *   per_loop      = (F * 1000) / loops_per_sec
+  *   ratio         = per_loop / 27317
+  *   presumed freq = (F * 1000) / ratio
+  *
+  * NOTE(review): the reason for the F * 1000 scaling and the origin of
+  * the constant 27317 are not evident from this file - confirm before
+  * relying on the printed "Presumed freq".
+  */
+ const double nano = 1000 * 1000 * 1000;
+ const int calibration = 27317;
+ int iterations = 100000000;
+ LARGE_INTEGER lFrequency;
+ LARGE_INTEGER start,stop;
+ double frequency, scaled_frequency;
+ double ticks, elapsed_ns, loops_per_sec, per_loop;
+ double ratio, actual_freq;
+ int g = 0;
+ int i;
+
+ QueryPerformanceFrequency(&lFrequency);
+ frequency = (double)lFrequency.QuadPart;
+
+ /* Pin the thread to the first processor while timing */
+ SetThreadAffinityMask(GetCurrentThread(),0x1);
+ QueryPerformanceCounter(&start);
+ for( i = 0; i < iterations; i++)
+ {
+   g++;
+   g++;
+   g++;
+   g++;
+ }
+ QueryPerformanceCounter(&stop);
+
+ /* Original author's note: the scaled figure differs from the WMI data,
+    i.e. 3125 when WMI reports 3201 and CPU-Z 3199 */
+ scaled_frequency = frequency * 1000;
+ ticks = (double)stop.QuadPart - (double)start.QuadPart;
+ elapsed_ns = (ticks * nano) /frequency;
+ loops_per_sec = iterations / (elapsed_ns/nano);
+ per_loop = scaled_frequency / loops_per_sec;
+
+ ratio = per_loop / calibration;
+ actual_freq = scaled_frequency / ratio;
+
+ printf("Perf counter freq: %f\n", scaled_frequency);
+ printf("Loops per sec:      %f\n", loops_per_sec);
+ printf("Perf counter freq div loops per sec: %f\n", per_loop);
+ printf("Presumed freq: %f\n", actual_freq);
+ printf("ratio: %f\n", ratio);
+ printf("time=%f\n",elapsed_ns);
+
+ return ratio;
+}
--- a/perflab/matrix/clock.h
+++ b/perflab/matrix/clock.h
@@ -0,0 +1,12 @@
+/* Routines for using cycle counter */
+
+/* Start the counter */
+void start_counter(void);
+
+/* Get # cycles since the last start_counter() call, returned as a double */
+double get_counter(void);
+void make_CPU_busy(void);
+
+double mhz(int verbose);
+double CPU_Factor(void);
+//double GetCpuClock(void);
--- a/perflab/matrix/cpe.c
+++ b/perflab/matrix/cpe.c
@@ -0,0 +1,117 @@
+/* Compute CPE for function */
+#include <stdlib.h>
+#include <stdio.h>
+#include "fcyc.h"
+#include "cpe.h"
+#include "lsquare.h"
+#include "clock.h"
+
+/* Find number of cycles taken by function.
+   Do this by running number of trials until best two within TOL of
+   each other
+*/
+double measure_function(elem_fun_t f, int cnt)
+{
+    /* Need to fudge fact that fcyc wants a function taking a pointer,
+       while our function takes an int: cnt is passed in the pointer value */
+    test_funct tf = (test_funct) f;
+    return fcyc(tf, (int *) (int) cnt);
+}
+
+#define MAXCNT 100
+
+#define LIM RAND_MAX
+
+/* LCM of unrolling degree */
+#ifdef USE_UNI
+#define UNROLL 32
+#else /* USE_UNI */
+#define UNROLL 1
+#endif
+
+static long int get_cnt(long int index, long int samples,
+		   long int maxcnt, sample_t smethod, double bias)
+{
+    /* Count for sample number index: a point in [bias*maxcnt, maxcnt],
+       rounded down to a multiple of UNROLL.  Exits on an unknown smethod. */
+    long int mincnt = (long int) (bias*maxcnt);
+    double weight = 0.0;
+    switch (smethod) {
+    case UNI_SAMPLE:
+	weight = samples > 1 ? (double) index/(samples - 1) : 0.0; /* no 0/0 for a lone sample */
+	break;
+    case RAN_SAMPLE:
+	weight = (double) (rand() % LIM) / (double) (LIM-1);
+	break;
+    default:
+	fprintf(stderr, "Undefined sampling method %d\n", smethod);
+	exit(1);
+    }
+    return UNROLL * ((long int) (mincnt + weight*(maxcnt-mincnt))/UNROLL);
+}
+
+#define SEED 31415
+
+/* Find cpe for function f, which allows cnt up to maxcnt, using
+   specified number of sample points.
+   If data_file, then print data so that can plot points with Excel
+   smethod determines method for generating samples
+*/
+double find_cpe_full(elem_fun_t f, long int maxcnt, long int samples, FILE *data_file,
+		     sample_t smethod, double bias, long int verbose)
+{
+    /* Measure f at `samples` counts, fit cycles against count by least
+       squares and return the slope (the CPE); output goes to data_file */
+    long int i;
+    double cpe, overhead = 0;
+    double *cnt_val = calloc(samples, sizeof(double));
+    double *cycle_val = calloc(samples, sizeof(double));
+    if (samples > 0 && (!cnt_val || !cycle_val)) {	/* both arrays are indexed below */
+	fprintf(stderr, "Fatal error.  Calloc returned null when trying to store samples\n");
+	exit(1);
+    }
+    srand(SEED);	/* fixed seed: the same sample counts on every call */
+    for (i = 0; i < samples; i++) {
+	long int cnt = get_cnt(i, samples, maxcnt, smethod, bias);
+	cnt_val[i] = cnt;
+	cycle_val[i] = measure_function(f, cnt);
+	if (cycle_val[i] < 1.0)
+	    fprintf(stderr, "Got %.2f cycles for count %ld\n", cycle_val[i], cnt);
+    }
+    /* Fit data */
+    cpe = ls_slope(cnt_val, cycle_val, samples);
+    if (data_file)
+	overhead = ls_intercept(cnt_val, cycle_val, samples);
+    if (data_file && verbose > 1) {
+	/* Print x values */
+	fprintf(data_file, "Cnt\t0");
+	for (i = 0; i < samples; i++)
+	    fprintf(data_file, "\t%.0f", cnt_val[i]);
+	fprintf(data_file, "\n");
+	/* Print y values */
+	fprintf(data_file, "Cycs.\t");
+	for (i = 0; i < samples; i++)
+	    fprintf(data_file, "\t%.2f", cycle_val[i]);
+	fprintf(data_file, "\n");
+	/* Print a*x+b values */
+	fprintf(data_file, "Interp.\t%.2f", overhead);
+	for (i = 0; i < samples; i++)
+	    fprintf(data_file, "\t%.2f", cpe*cnt_val[i]+overhead);
+	fprintf(data_file, "\n");
+    }
+    if (data_file && verbose) {
+	fprintf(data_file, "cpe\t%.2f\tovhd\t%.2f\tavgerr\t\\%.3f\tmaxerr\t\\%.3f\n",
+		cpe, overhead,
+		ls_error(cnt_val, cycle_val, samples, LS_AVG),
+		ls_error(cnt_val, cycle_val, samples, LS_MAX));
+    }
+    free(cnt_val);
+    free(cycle_val);
+    return cpe;
+}
+
+/* find_cpe_full() with defaults: 100 random samples, bias 0.3, verbose 0 */
+double find_cpe(elem_fun_t f, int maxcnt)
+{
+    return find_cpe_full(f, maxcnt, 100, stdout, RAN_SAMPLE, 0.3, 0);
+}
--- a/perflab/matrix/cpe.h
+++ b/perflab/matrix/cpe.h
@@ -0,0 +1,31 @@
+/* Compute CPE for function */
+
+/* Compute for function that is linear in some parameter cnt */
+typedef void (*elem_fun_t)(int);
+
+/* Different ways of finding samples 
+   UNI_SAMPLE: samples uniformly spaced between bias*maxcnt and maxcnt
+   RAN_SAMPLE: samples randomly selected between bias*maxcnt and maxcnt
+*/
+
+typedef enum {UNI_SAMPLE, RAN_SAMPLE}
+  sample_t;
+
+/* Find cpe for function f, which allows cnt up to maxcnt.
+   Uses default parameters
+*/
+double find_cpe(elem_fun_t f, int maxcnt);
+
+/* Find cpe for function f, which allows cnt up to maxcnt, using
+   specified number of sample points.
+   If data_file, then print data so that can plot points with Excel
+   smethod determines method for generating samples
+*/
+double find_cpe_full(elem_fun_t f, long int maxcnt, long int samples, FILE *data_file,
+		     sample_t smethod, double bias, long int verbose);
+
+/* Find number of cycles taken by function.
+   Do this by running number of trials until best two within TOL (2%) of
+   each other
+*/
+double measure_function(elem_fun_t f, int cnt);
--- a/perflab/matrix/fcyc.c
+++ b/perflab/matrix/fcyc.c
@@ -0,0 +1,223 @@
+/* Compute time used by function f */
+#include <stdlib.h>
+#include <time.h>
+#include <stdio.h>
+
+#include "clock.h"
+#include "fcyc.h"
+
+#define K 3
+#define MAXSAMPLES 20
+#define EPSILON 0.01
+#define COMPENSATE 0
+#define CLEAR_CACHE 0
+#define CACHE_BYTES (1<<19)
+#define CACHE_BLOCK 32
+#define MAX_ITER_TIMES 10
+
+static long int kbest = K;
+static long int compensate = COMPENSATE;
+static long int clear_cache = CLEAR_CACHE;
+static long int maxsamples = MAXSAMPLES;
+static double epsilon = EPSILON;
+static long int cache_bytes = CACHE_BYTES;
+static long int cache_block = CACHE_BLOCK;
+
+static long int *cache_buf = NULL;
+
+static double *values = NULL;
+static long int samplecount = 0;
+
+#define KEEP_VALS 0
+#define KEEP_SAMPLES 0
+
+#if KEEP_SAMPLES
+static double *samples = NULL;
+#endif
+
+/* Start new sampling process */
+static void init_sampler(void)
+{
+    free(values);	/* free(NULL) is a no-op, so no guard is needed */
+    values = calloc(kbest, sizeof(double));
+    if (!values)
+	abort();	/* out of memory: values[] is indexed unconditionally later */
+#if KEEP_SAMPLES
+    free(samples);
+    /* Allocate extra for wraparound analysis */
+    samples = calloc(maxsamples+kbest, sizeof(double));
+#endif
+    samplecount = 0;
+}
+
+/* Add new sample: values[] keeps the kbest smallest samples, in ascending order */
+static void add_sample(double val)
+{
+    long int pos = 0;
+    if (samplecount < kbest) {
+	pos = samplecount;	/* still filling values[]: append */
+	values[pos] = val;
+    } else if (val < values[kbest-1]) {
+	pos = kbest-1;	/* smaller than the largest kept value: overwrite it */
+	values[pos] = val;
+    }
+#if KEEP_SAMPLES
+    samples[samplecount] = val;
+#endif
+    samplecount++;
+    /* Insertion sort: move the stored value down until values[] is ordered again */
+    while (pos > 0 && values[pos-1] > values[pos]) {
+	double temp = values[pos-1];
+	values[pos-1] = values[pos];
+	values[pos] = temp;
+	pos--;
+    }
+}
+
+/* Have the kbest smallest measurements converged to within epsilon? */
+static long int has_converged(void)
+{
+    if (samplecount < kbest)
+	return 0;	/* not enough samples yet */
+    return values[kbest-1] <= (1 + epsilon)*values[0];
+}
+
+/* Code to clear cache */
+
+
+static volatile long int sink = 0;
+
+static void clear(void)
+{
+    /* Sweep a cache_bytes buffer one cache block (at least one word) at a time */
+    long int x = sink;
+    long int *cptr, *cend;
+    long int incr = cache_block >= (long int) sizeof(long int) ? cache_block/sizeof(long int) : 1;
+    if (!cache_buf) {
+	cache_buf = malloc(cache_bytes);
+	if (!cache_buf) {
+	    fprintf(stderr, "Fatal error.  Malloc returned null when trying to clear cache\n");
+	    exit(1);
+	}
+	for (cptr = cache_buf; cptr < cache_buf + cache_bytes/sizeof(long int); cptr++)
+	    *cptr = 0;	/* give every word a defined value before it is read */
+    }
+    cend = cache_buf + cache_bytes/sizeof(long int);
+    for (cptr = cache_buf; cptr < cend; cptr += incr)
+	x += *cptr;	/* the sum ends up in the volatile sink */
+    sink = x;
+}
+
+double fcyc(test_funct f, int *params)
+{
+		int i;
+    double result;
+    init_sampler();
+    if (compensate) {
+	do {
+	    double cyc;
+	    if (clear_cache)
+		clear();
+	    start_counter();
+	    f(params);
+	    cyc = get_counter();
+	    if (cyc > 0.0)
+		add_sample(cyc);
+	} while (!has_converged() && samplecount < maxsamples);
+    } else {
+	do {
+	    double cyc;
+	    if (clear_cache)
+		clear();
+	    start_counter();
+	    for (i=0;i<MAX_ITER_TIMES;i++)
+    		f(params);
+	    cyc = get_counter()/MAX_ITER_TIMES;
+	    if (cyc > 0.0)
+		add_sample(cyc);
+
+	} while (!has_converged() && samplecount < maxsamples);
+    }
+#ifdef DEBUG
+    {
+	long int i;
+	printf(" %ld smallest values: [", kbest);
+	for (i = 0; i < kbest; i++)
+	    printf("%.0f%s", values[i], i==kbest-1 ? "]\n" : ", ");
+    }
+#endif
+    result = values[0];
+#if !KEEP_VALS
+    free(values);
+    values = NULL;
+#endif
+    return result;
+}
+
+
+/***********************************************************/
+/* Set the various parameters used by measurement routines */
+
+
+/* When set, will run code to clear cache before each measurement
+   Default = 0
+*/
+void set_fcyc_clear_cache(long int clear)
+{
+    clear_cache = clear;
+}
+
+/* Set size of cache to use when clearing cache
+   Default = 1<<19 (512KB)
+*/
+void set_fcyc_cache_size(long int bytes)
+{
+    /* Same size as before: keep the current buffer */
+    if (bytes == cache_bytes)
+	return;
+    cache_bytes = bytes;
+    /* Drop the old buffer; clear() allocates a new one when it finds none */
+    free(cache_buf);
+    cache_buf = NULL;
+}
+
+/* Set size of cache block
+   Default = 32
+*/
+void set_fcyc_cache_block(long int bytes) {
+    cache_block = bytes;
+}
+
+
+/* When set, will attempt to compensate for timer interrupt overhead
+   Default = 0
+*/
+void set_fcyc_compensate(long int compensate_arg)
+{
+    compensate = compensate_arg;
+}
+
+/* Value of K in K-best
+   Default = 3
+*/
+void set_fcyc_k(long int k)
+{
+    kbest = k < 1 ? 1 : k;	/* values[kbest-1] is indexed, so at least one slot is required */
+}
+
+/* Maximum number of samples attempting to find K-best within some tolerance.
+   When exceeded, just return best sample found.
+   Default = 20
+*/
+void set_fcyc_maxsamples(long int maxsamples_arg)
+{
+    maxsamples = maxsamples_arg;
+}
+
+/* Tolerance required for K-best
+   Default = 0.01
+*/
+void set_fcyc_epsilon(double epsilon_arg)
+{
+    epsilon = epsilon_arg;
+}
--- a/perflab/matrix/fcyc.h
+++ b/perflab/matrix/fcyc.h
@@ -0,0 +1,52 @@
+
+/* Fcyc measures the speed of any "test function."  Such a function
+   is passed a list of integer parameters, which it may interpret
+   in any way it chooses.
+*/
+
+typedef void (*test_funct)(long int *);
+
+/* Compute number of cycles used by function f on given set of parameters */
+double fcyc(test_funct f, int* params);
+
+/***********************************************************/
+/* Set the various parameters used by measurement routines */
+
+
+/* When set, will run code to clear cache before each measurement
+   Default = 0
+*/
+void set_fcyc_clear_cache(long int clear);
+
+/* Set size of cache to use when clearing cache
+   Default = 1<<19 (512KB)
+*/
+void set_fcyc_cache_size(long int bytes);
+
+/* Set size of cache block
+   Default = 32
+*/
+void set_fcyc_cache_block(long int bytes);
+
+/* When set, will attempt to compensate for timer interrupt overhead
+   Default = 0
+*/
+void set_fcyc_compensate(long int compensate);
+
+/* Value of K in K-best
+   Default = 3
+*/
+void set_fcyc_k(long int k);
+
+/* Maximum number of samples attempting to find K-best within some tolerance.
+   When exceeded, just return best sample found.
+   Default = 20
+*/
+void set_fcyc_maxsamples(long int maxsamples);
+
+/* Tolerance required for K-best
+   Default = 0.01
+*/
+void set_fcyc_epsilon(double epsilon);
+
+
--- a/perflab/matrix/lsquare.c
+++ b/perflab/matrix/lsquare.c
@@ -0,0 +1,94 @@
+/* Compute least squares fit of set of data points */
+#include <stdio.h>
+#include <stdlib.h>
+#include "lsquare.h"
+
+typedef struct {
+    double sum_x;
+    double sum_y;
+    double sum_xx;
+    double sum_xy;
+} ls_stat_t;
+
+/* Accumulate various sums of the data */
+static void ls_stats(double *xval, double *yval, int cnt, ls_stat_t *statp)
+{
+    /* Store sum(x), sum(y), sum(x*x) and sum(x*y) of the cnt points in *statp */
+    double sx = 0.0, sy = 0.0, sxx = 0.0, sxy = 0.0;
+    int k;
+    for (k = 0; k < cnt; k++) {
+	sx += xval[k];
+	sy += yval[k];
+	sxx += xval[k] * xval[k];
+	sxy += xval[k] * yval[k];
+    }
+    statp->sum_x = sx;
+    statp->sum_y = sy;
+    statp->sum_xx = sxx;
+    statp->sum_xy = sxy;
+}
+
+double ls_slope(double *xval, double *yval, int cnt)
+{
+    /* Slope m of the least squares line y = m*x + b through the cnt points */
+    ls_stat_t s;
+    double numer;
+    ls_stats(xval, yval, cnt, &s);
+    numer = cnt * s.sum_xy - s.sum_x * s.sum_y;
+    return numer/(cnt * s.sum_xx - s.sum_x*s.sum_x);
+}
+
+double ls_intercept(double *xval, double *yval, int cnt)
+{
+    /* Intercept b of the least squares line y = m*x + b through the cnt points */
+    ls_stat_t s;
+    double numer;
+    ls_stats(xval, yval, cnt, &s);
+    numer = s.sum_xx * s.sum_y - s.sum_xy * s.sum_x;
+    return numer/(cnt * s.sum_xx - s.sum_x*s.sum_x);
+}
+
+static double rel_err(double x, double y, double slope, double intercept)
+{
+    /* |y - fit(x)| / |fit(x)|, or the absolute error when fit(x) is 0 */
+    double pred_y = slope*x + intercept;
+    double offset = y - pred_y;
+    double scale = pred_y < 0 ? -pred_y : pred_y;	/* keeps the ratio non-negative */
+    if (offset < 0)
+	offset = -offset;
+    return scale == 0 ? offset : offset/scale;
+}
+
+double ls_error(double *xval, double *yval, int cnt, ls_err_t etype)
+{
+    /*
+     * Relative error of the least squares fit over the cnt points:
+     *   LS_AVG - mean of the per-point relative errors
+     *   LS_MAX - largest per-point relative error
+     * Any other etype prints a message and exits once a point is processed.
+     */
+    double slope = ls_slope(xval, yval, cnt);
+    double intercept = ls_intercept(xval, yval, cnt);
+    double num = 0;
+    double denom = 0;
+    int i;
+
+    for (i = 0; i < cnt; i++) {
+	double e = rel_err(xval[i], yval[i], slope, intercept);
+
+	if (etype == LS_AVG) {
+	    /* running total, divided by the point count at the end */
+	    num += e;
+	    denom++;
+	} else if (etype == LS_MAX) {
+	    /* running maximum, divided by 1 at the end */
+	    if (num < e)
+		num = e;
+	    denom = 1;
+	} else {
+	    fprintf(stderr, "Invalid error type: %d\n", etype);
+	    exit(1);
+	}
+    }
+    return num/denom;
+}
--- a/perflab/matrix/lsquare.h
+++ b/perflab/matrix/lsquare.h
@@ -0,0 +1,11 @@
+/* Compute least squares fit of set of data points */
+
+/* Fit is of form y = mx + b.  m is slope, b is intercept */
+double ls_slope(double *xval, double *yval, int cnt);
+double ls_intercept(double *xval, double *yval, int cnt);
+
+typedef enum {LS_AVG, LS_MAX} ls_err_t;
+
+/* Determine relative error (either maximum or average) of least squares fit */
+double ls_error(double *xval, double *yval, int cnt, ls_err_t etype);
+
--- a/perflab/matrix/matrix/matrix.sln
+++ b/perflab/matrix/matrix/matrix.sln
@@ -0,0 +1,28 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 14
+VisualStudioVersion = 14.0.25420.1
+MinimumVisualStudioVersion = 10.0.40219.1
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "matrix", "matrix.vcxproj", "{15DC376D-CB40-4A27-BCF8-BCE93039E478}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|x64 = Debug|x64
+		Debug|x86 = Debug|x86
+		Release|x64 = Release|x64
+		Release|x86 = Release|x86
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{15DC376D-CB40-4A27-BCF8-BCE93039E478}.Debug|x64.ActiveCfg = Debug|x64
+		{15DC376D-CB40-4A27-BCF8-BCE93039E478}.Debug|x64.Build.0 = Debug|x64
+		{15DC376D-CB40-4A27-BCF8-BCE93039E478}.Debug|x86.ActiveCfg = Debug|Win32
+		{15DC376D-CB40-4A27-BCF8-BCE93039E478}.Debug|x86.Build.0 = Debug|Win32
+		{15DC376D-CB40-4A27-BCF8-BCE93039E478}.Release|x64.ActiveCfg = Release|x64
+		{15DC376D-CB40-4A27-BCF8-BCE93039E478}.Release|x64.Build.0 = Release|x64
+		{15DC376D-CB40-4A27-BCF8-BCE93039E478}.Release|x86.ActiveCfg = Release|Win32
+		{15DC376D-CB40-4A27-BCF8-BCE93039E478}.Release|x86.Build.0 = Release|Win32
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
--- a/perflab/matrix/matrix/matrix.vcxproj
+++ b/perflab/matrix/matrix/matrix.vcxproj
@@ -0,0 +1,123 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{15DC376D-CB40-4A27-BCF8-BCE93039E478}</ProjectGuid>
+    <RootNamespace>matrix</RootNamespace>
+    <WindowsTargetPlatformVersion>8.1</WindowsTargetPlatformVersion>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <PlatformToolset>v140</PlatformToolset>
+    <CharacterSet>MultiByte</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <PlatformToolset>v140</PlatformToolset>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>MultiByte</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <PlatformToolset>v140</PlatformToolset>
+    <CharacterSet>MultiByte</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <PlatformToolset>v140</PlatformToolset>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>MultiByte</CharacterSet>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+  </ImportGroup>
+  <ImportGroup Label="Shared">
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup />
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <SDLCheck>true</SDLCheck>
+    </ClCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <SDLCheck>true</SDLCheck>
+    </ClCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <SDLCheck>true</SDLCheck>
+    </ClCompile>
+    <Link>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <SDLCheck>true</SDLCheck>
+    </ClCompile>
+    <Link>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <ClCompile Include="..\clock.c" />
+    <ClCompile Include="..\cpe.c" />
+    <ClCompile Include="..\fcyc.c" />
+    <ClCompile Include="..\lsquare.c" />
+    <ClCompile Include="..\rowcol.c" />
+    <ClCompile Include="..\rowcol_test.c" />
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+  </ImportGroup>
+</Project>
--- a/perflab/matrix/matrix/matrix.vcxproj.filters
+++ b/perflab/matrix/matrix/matrix.vcxproj.filters
@@ -0,0 +1,37 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup>
+    <Filter Include="源文件">
+      <UniqueIdentifier>{4FC737F1-C7A5-4376-A066-2A32D752A2FF}</UniqueIdentifier>
+      <Extensions>cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx</Extensions>
+    </Filter>
+    <Filter Include="头文件">
+      <UniqueIdentifier>{93995380-89BD-4b04-88EB-625FBE52EBFB}</UniqueIdentifier>
+      <Extensions>h;hh;hpp;hxx;hm;inl;inc;xsd</Extensions>
+    </Filter>
+    <Filter Include="资源文件">
+      <UniqueIdentifier>{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}</UniqueIdentifier>
+      <Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>
+    </Filter>
+  </ItemGroup>
+  <ItemGroup>
+    <ClCompile Include="..\clock.c">
+      <Filter>源文件</Filter>
+    </ClCompile>
+    <ClCompile Include="..\cpe.c">
+      <Filter>源文件</Filter>
+    </ClCompile>
+    <ClCompile Include="..\fcyc.c">
+      <Filter>源文件</Filter>
+    </ClCompile>
+    <ClCompile Include="..\lsquare.c">
+      <Filter>源文件</Filter>
+    </ClCompile>
+    <ClCompile Include="..\rowcol.c">
+      <Filter>源文件</Filter>
+    </ClCompile>
+    <ClCompile Include="..\rowcol_test.c">
+      <Filter>源文件</Filter>
+    </ClCompile>
+  </ItemGroup>
+</Project>
--- a/perflab/matrix/rowcol.c
+++ b/perflab/matrix/rowcol.c
@@ -0,0 +1,77 @@
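+/**************************************************************************
+	Row/column summation functions.  Edit this file as follows:
+	1. Write your student ID and name in the comment below;
+	2. Implement different versions of the row/column summation functions;
+	3. Edit the rc_fun_rec rc_fun_tab array so that your best answers
+		(best column sum, best row and column sum) are its first two entries
+***************************************************************************/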
+   
+/*
+	学号：201209054233
+	姓名：夜半加班狂
+*/
+
+
+#include  <stdio.h>
+#include  <stdlib.h>
+#include  "rowcol.h"
+#include  <math.h>
+
+/* Reference implementation of the column-sum function */
+/* Compute the sum of every column of the matrix.  The parameters are the same
+	as for the row-and-column sum; the 2nd parameter is simply not used here
+*/
+
+void c_sum(matrix_t M, vector_t rowsum, vector_t colsum)
+{
+    int i,j;
+    for (j = 0; j < N; j++) {
+	colsum[j] = 0;
+	for (i = 0; i < N; i++)
+	    colsum[j] += M[i][j];
+    }
+}
+
+
+/* Reference implementation of the row-and-column-sum function */
+/* Compute the sum of every row and of every column of the matrix. */
+
+void rc_sum(matrix_t M, vector_t rowsum, vector_t colsum)
+{
+    int i,j;
+    for (i = 0; i < N; i++) {
+	rowsum[i] = colsum[i] = 0;
+	for (j = 0; j < N; j++) {
+	    rowsum[i] += M[i][j];
+	    colsum[i] += M[j][i];
+	}
+    }
+}
+
+
+
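+/*
+	Each entry of this table has the form (function, COL/ROWCOL, "description"):
+	COL means the function computes only the sum of each column;
+	ROWCOL means the function computes the sum of each row and each column.
+	Put the two implementations you consider best at the front.
+	For example:
+	{my_c_sum1, COL, "A really bad column sum"},
+	{my_rc_sum2, ROWCOL, "A somewhat better row and column sum"},
+*/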
+
+rc_fun_rec rc_fun_tab[] = 
+{
+
+  /* First entry: should be the best column-sum function you wrote */
+    {c_sum, COL, "Best column sum"},
+  /* Second entry: should be the best row-and-column-sum function you wrote */
+    {rc_sum, ROWCOL, "Best row and column sum"},
+
+    {c_sum, COL, "Column sum, reference implementation"},
+
+    {rc_sum, ROWCOL, "Row and column sum, reference implementation"},
+
+ /* Do not modify or delete the entry below!!  It marks the end of the table */
+    {NULL,ROWCOL,NULL}
+};
--- a/perflab/matrix/rowcol.h
+++ b/perflab/matrix/rowcol.h
@@ -0,0 +1,35 @@
+/* Matrix row and/or column summation code */
+
+/* Size of matrices */
+/* $begin rcdecl */
+#define N 512
+/* $end rcdecl */
+
+/* Data types */
+
+/* Pointer type for vectors */
+typedef int *vecp_t;
+/* $begin rcdecl */
+/* N x N matrix */
+typedef int matrix_t[N][N];
+
+/* Vector of length N */
+typedef int vector_t[N];
+/* $end rcdecl */
+
+/* Different sum/product function types */
+typedef enum { COL, ROWCOL } rc_comp_t;
+
+/* Summation function */
+typedef void (*rc_fun)(matrix_t, vector_t, vector_t);
+
+typedef struct {
+    rc_fun f;
+    rc_comp_t rc_type; /* What computation does it perform? */
+    char *descr;
+} rc_fun_rec, *rc_fun_ptr;
+
+/* Table of functions to test.  Null terminated */
+extern rc_fun_rec rc_fun_tab[];
+
+
--- a/perflab/matrix/rowcol_test.c
+++ b/perflab/matrix/rowcol_test.c
@@ -0,0 +1,173 @@
+#include <stdio.h>
+#include <stdlib.h>
+//#include <random.h>
+#include "rowcol.h"
+#include "fcyc.h"
+#include "clock.h"
+
+#define MAX_ITER_COUNT 100
+
+/* Define performance standards */
+static struct {
+  double cref;  /* Cycles taken by reference solution */
+  double cbest; /* Cycles taken by our best implementation */
+} cstandard[2] = 
+{{7.7, 6.40}, /* Column Sum */
+ {9.75, 6.60} /* Row & Column Sum */
+};
+
+/* Put in code to align matrix so that it starts on a cache block boundary.
+   This makes the cache performance of the code a bit more predictable
+*/
+
+/* Words per cache block.  OK if this is an estimate as long as it
+   is a multiple of the actual value
+*/
+#define WPB 16
+
+int verbose = 1;
+int data[N*N+WPB];
+int *mstart;
+
+typedef vector_t *row_t;
+
+/* Reference row and column sums */
+vector_t rsref, csref, rcomp, ccomp;
+
+static void init_tests(void);
+extern void make_CPU_busy(void);
+
+static void init_tests(void)
+{
+    /* Align mstart inside data[], fill the N x N matrix with rand() values
+       and build the reference row sums (rsref) and column sums (csref) */
+    int row, col;
+    size_t block = sizeof(int) * WPB;
+    size_t addr = (size_t) data + block-1;
+    mstart = (int *) (addr / block * block);
+    for (row = 0; row < N; row++)
+	rsref[row] = csref[row] = 0;
+    for (row = 0; row < N; row++) {
+	int *cell = mstart + row*N;
+	for (col = 0; col < N; col++) {
+	    cell[col] = rand();
+	    rsref[row] += cell[col];
+	    csref[col] += cell[col];
+	}
+    }
+}
+
+
+/* Test function on all values */
+int test_rc(rc_fun f, FILE *rpt, rc_comp_t rc_type) {
+    /* Run f on the test matrix; return 1 if its sums match the reference
+       sums, else 0 after reporting the first bad index on rpt (if set) */
+    int idx;
+    int check_rows = (rc_type == ROWCOL);
+    int check_cols = (rc_type == ROWCOL || rc_type == COL);
+    for (idx = 0; idx < N; idx++)
+	rcomp[idx] = ccomp[idx] = 0xDEADBEEF;
+    f((row_t)mstart, rcomp, ccomp);
+
+    for (idx = 0; idx < N; idx++) {
+	int bad = 0;
+	if (check_rows && rsref[idx] != rcomp[idx]) {
+	    bad = 1;
+	    if (rpt)
+		fprintf(rpt, "对第%d行的计算出错！正确结果是%d，但是计算得到%d\n",
+			idx, rsref[idx], rcomp[idx]);
+	}
+	if (check_cols && csref[idx] != ccomp[idx]) {
+	    bad = 1;
+	    if (rpt)
+		fprintf(rpt, "对第%d列的计算出错！正确结果是%d，但是计算得到%d\n",
+			idx, csref[idx], ccomp[idx]);
+	}
+	if (bad)
+	    return 0;
+    }
+    return 1;
+}
+
+/* Kludge for the cycle measuring code: intf is really the rc_fun to run, cast to int * */
+void do_test(int *intf)
+{
+  rc_fun f = (rc_fun) intf;
+  f((row_t)mstart, rcomp, ccomp);
+}
+
+void time_rc(rc_fun f, rc_comp_t rc_type, char *descr, double *cycp)
+{
+  /* Check f for correctness and, if it passes, store through cycp its
+     cycles per matrix element averaged over MAX_ITER_COUNT fcyc() runs */
+  int run;
+  int *intf = (int *) f;
+  double total = 0, cme;
+  if (verbose) printf("函数：%s\n", descr);
+  if (!test_rc(f, stdout, rc_type))
+    return;
+  make_CPU_busy();
+  for (run = 0; run < MAX_ITER_COUNT; run++)
+    total += fcyc(do_test, intf);
+  total = total/MAX_ITER_COUNT;
+  cme = total/(N*N);
+  if (verbose) printf("  总周期数 = %.2f, 平均周期/元素 = %.2f\n", total, cme);
+  if (cycp)
+    *cycp = cme;
+}
+
+/* Compute the grade achieved by function */
+static double compute_score(double cmeas, double cref, double cbest)
+{
+  /* Grade the measured speedup over cref against the best known speedup:
+     0 below 10% of the best gain, 120 above 110%, linear (20..120) between */
+  double smeas = cref/cmeas;
+  double best_gain = cref/cbest - 1;
+  if (smeas < 0.1*best_gain+1)
+    return 0;
+  return smeas > 1.1*best_gain+1 ? 120 : 100*((smeas-1.0)/best_gain + 0.1);
+}
+
+int main(int argc, char *argv[])
+{
+  /*
+   * Time every function in rc_fun_tab.  Entries 0 and 1 are the graded
+   * ones.  Normally each score is printed right after its measurement;
+   * with exactly two command line arguments verbose output is switched
+   * off and one tab-separated summary line is printed instead.
+   */
+  int i;
+  double cme;
+  /* Same fallback as cme below: never unset, even with fewer than two table entries */
+  double cme_c = 100.0, cme_rc = 100.0;
+  int EnableScore=0;
+
+  if (argc == 3) {
+    EnableScore = 1;
+    verbose = 0;
+  }
+  init_tests();
+  set_fcyc_clear_cache(1);  /* Set so that clears cache between runs */
+  for (i = 0; rc_fun_tab[i].f != NULL; i++) {
+    cme = 100.0;  /* kept when the function fails its correctness test */
+    time_rc(rc_fun_tab[i].f,
+	    rc_fun_tab[i].rc_type, rc_fun_tab[i].descr, &cme);
+    if (i == 0) {
+      cme_c = cme;
+      if (!EnableScore)
+	printf("  最高\"列求和\"得分   ======================== %.0f\n",
+	       compute_score(cme, cstandard[0].cref, cstandard[0].cbest));
+    }
+    if (i == 1) {
+      cme_rc = cme;
+      if (!EnableScore)
+	printf("  最高\"行和列求和\"得分 ====================== %.0f\n",
+	       compute_score(cme, cstandard[1].cref, cstandard[1].cbest));
+    }
+  }
+
+  if (EnableScore)
+    printf("%.2f\t %.0f\t %.2f\t %.0f\t 0\t 0\n",cme_c,compute_score(cme_c, cstandard[0].cref, cstandard[0].cbest),
+	   cme_rc,compute_score(cme_rc, cstandard[1].cref, cstandard[1].cbest));
+  return 0;
+}