CSAPP CacheLab

Part A

这一部分需要实现一个lru替换策略的cache,然后读取一系列操作内存的指令进行测试。最终将命中、不命中、替换的次数输出,验证正确性。

整个cache模拟的过程为:读取命令行,初始化cache和time数据结构,逐条读取指令,搜索(替换)cache行、更新时间。

#include "cachelab.h"
#include "stdio.h"
#include "stdlib.h"
#include "unistd.h"
#include "getopt.h" #define m 64typedef struct Line {int valid;unsigned long long tag;/* ...Data... */
}cacheline;int hit_count = 0;
int miss_count = 0;
int eviction_count = 0;int display = 0;
unsigned int s, E, b;
char *tf = NULL;void GetCmd(int argc, char **argv) {int i;while ((i = getopt(argc, argv, "hvs:E:b:t:")) != -1) {switch (i){case 'h': {/* ... */ break;}case 'v': {display = 1;break;} case 's': {s = atoi(optarg);break;}case 'E': {E = atoi(optarg);break;}case 'b': {b = atoi(optarg);break;}case 't': {tf = optarg;break;}default: {break;}}}
}cacheline *InitCache() {unsigned int size = (1 << s) * E;cacheline *cache = (cacheline *)malloc(sizeof(cacheline) * size);for (int i = 0; i < size; ++i) {cache[i].valid = 0;}return cache;
}int* InitTimeTable() {unsigned int size = (1 << s) * E;int *time_table = (int *)malloc(sizeof(int) * size);for (int i = 0; i < size; ++i) {time_table[i] = 0;}return time_table;
}//操作单条指令
void Operate(cacheline *cache, int *time_table, char op, unsigned long long addr, int size) {if (op == 'I') {return;}unsigned long long tag = addr >> (s + b);int i = ((addr << (m - s - b)) >> (m - s)) * E; // 第i组int missed = 1, eviction = 1; // eviction = 0表示cache中仍有空间,不需要替换int pos = i; // 在搜索的过程中记录可能会被替换的行/* Search */for (int j = i; j < i + E; ++j) {if (cache[j].valid == 0) {pos = j;eviction = 0;} else if (cache[j].tag == tag) {missed = 0;eviction = 0;pos = j;} else if (eviction == 1 && time_table[j] > time_table[pos]) {pos = j;}}/* Update */if (missed == 1) {cache[pos].valid = 1;cache[pos].tag = tag;}for (int j = i; j < i + E; ++j) {if (j == pos) {time_table[j] = 0;} else {++time_table[j];}}/* Display */if (display) {printf("%c %llx %d", op, addr, size);if (missed) {printf(" miss");if (eviction) {printf(" eviction");}} else {printf(" hit");}if (op == 'M') {printf(" hit");}printf("\n");}/* Result */hit_count += (!missed) + (op == 'M');miss_count += missed;eviction_count += eviction;
}// 逐条读取指令模拟
void Sim(void *cache, int *time_table) {FILE *f = fopen(tf, "r");char op;unsigned long long addr;int size;while (fscanf(f," %c %llx,%d",&op,&addr,&size) != -1) {Operate(cache, time_table, op, addr, size);}fclose(f);
}int main(int argc, char **argv) {GetCmd(argc, argv);cacheline *cache = InitCache();int *time_table = InitTimeTable();Sim(cache, time_table);free(cache);free(time_table);printSummary(hit_count, miss_count, eviction_count);return 0;
}

测试如下:

Part B

这一部分是需要对一个矩阵转置程序进行优化,使程序执行的miss次数尽可能得少,共有三种规格的矩阵需要进行优化。

一个普通的矩阵转置程序如下所示:

void trans(int M, int N, int A[N][M], int B[M][N])
{int i, j, tmp;for (i = 0; i < N; i++) {for (j = 0; j < M; j++) {tmp = A[i][j];B[j][i] = tmp;}}    
}

这里的cache参数为s = 5,E = 1,b = 5,即一共32组,每组1行,每行32字节(8个int型数据),可以写个程序输出每个A[i][j]对应cache 的哪一行:

void print(int M, int N) {int tmp;for (int i = 0; i < N; ++i) {for (int j = 0; j < M; ++j) {tmp = ((i * M + j) / 8) % 32;printf("%3d ", tmp);}printf("\n\n");}
}

这一部分的编程限制为,不能使用递归函数,最多只能定义12个局部变量(32位int型)。

32 * 32

对这个矩阵,以左上8 * 8的一部分为例,每个数据对应的cache行号如下图:

00000000
44444444
88888888
1212121212121212
1616161616161616
2020202020202020
2424242424242424
2828282828282828

如果一行一行的读数,A矩阵虽然能满足一定的空间局部性,但是B矩阵是按列写的,在第一列的前8个数据填入后,第9个数据将再次来到0行cache,将把第1个数据所在的0行cache替换,第10个数据将把第2个数据所在的4行cache替换...所以会发生很多次miss。

所以可以把矩阵分块,这里分为8 * 8的块,逐块进行转置。同时可以一次性读出8个数据,再一次性写入8个数据,进一步减少miss的次数。

char transpose_submit_32_32_desc[] = "32 * 32";
void transpose_submit_32_32(int M, int N, int A[N][M], int B[M][N]) {int t0, t1, t2, t3, t4, t5, t6, t7;for (int i = 0; i < N; i += 8) {for (int j = 0; j < M; j += 8) {for (int k = i; k < i + 8; ++k) {t0 = A[k][j];t1 = A[k][j + 1];t2 = A[k][j + 2];t3 = A[k][j + 3];t4 = A[k][j + 4];t5 = A[k][j + 5];t6 = A[k][j + 6];t7 = A[k][j + 7];B[j][k] = t0;B[j + 1][k] = t1;B[j + 2][k] = t2;B[j + 3][k] = t3;B[j + 4][k] = t4;B[j + 5][k] = t5;B[j + 6][k] = t6;B[j + 7][k] = t7;}}}
}

 miss次数为287。

 

64 * 64

该矩阵与32 * 32矩阵有所不同,其左上角的8 * 8部分的cache行号如下:

00000000
88888888
1616161616161616
2424242424242424
00000000
88888888
1616161616161616
2424242424242424

如果还是按照一次按行读8个数据,那么写的时候由于一列中,每4个数据的行号就会重复,所以这8个数据中的后4个所在的行会把前面的行替换掉。

如果按照一次按行读4个数据,虽然有一定的优化,但是miss数还是达不到满分。

这里用到的办法是仍然一次读取8个数据,只不过将其中的前4个写到正确的位置,后4个暂时存储在B矩阵中,稍后再转移过去。这里的存储位置需要是一个较长时间内不会被使用的位置,且没有行号冲突,所以实现时存储在了B矩阵每一行的末尾处。

char transpose_submit_64_64_desc[] = "64 * 64";
void transpose_submit_64_64(int M, int N, int A[N][M], int B[M][N]) {int t0, t1, t2, t3, t4, t5, t6, t7;for (int j = 0; j < M; j += 8) {for (int i = 0; i < N; i += 8) {if (i != 56) {for (int k = i; k < i + 8; ++k) {t0 = A[k][j];t1 = A[k][j + 1];t2 = A[k][j + 2];t3 = A[k][j + 3];t4 = A[k][j + 4];t5 = A[k][j + 5];t6 = A[k][j + 6];t7 = A[k][j + 7];B[j][k] = t0;B[j + 1][k] = t1;B[j + 2][k] = t2;B[j + 3][k] = t3;B[j][56 + (k - i)] = t4;B[j + 1][56 + (k - i)] = t5;B[j + 2][56 + (k - i)] = t6;B[j + 3][56 + (k - i)] = t7;}for (int k = j; k < j + 4; ++k) {t0 = B[k][56];t1 = B[k][57];t2 = B[k][58];t3 = B[k][59];t4 = B[k][60];t5 = B[k][61];t6 = B[k][62];t7 = B[k][63];B[j + 4 + (k - j)][i] = t0;B[j + 4 + (k - j)][i + 1] = t1;B[j + 4 + (k - j)][i + 2] = t2;B[j + 4 + (k - j)][i + 3] = t3;B[j + 4 + (k - j)][i + 4] = t4;B[j + 4 + (k - j)][i + 5] = t5;B[j + 4 + (k - j)][i + 6] = t6;B[j + 4 + (k - j)][i + 7] = t7;}} else {for (int k = i; k < i + 8; k += 2) {t0 = A[k][j];t1 = A[k][j + 1];t2 = A[k][j + 2];t3 = A[k][j + 3];t4 = A[k + 1][j];t5 = A[k + 1][j + 1];t6 = A[k + 1][j + 2];t7 = A[k + 1][j + 3];B[j][k] = t0;B[j + 1][k] = t1;B[j + 2][k] = t2;B[j + 3][k] = t3;B[j][k + 1] = t4;B[j + 1][k + 1] = t5;B[j + 2][k + 1] = t6;B[j + 3][k + 1] = t7;}for (int k = i; k < i + 8; k += 2) {t0 = A[k][j + 4];t1 = A[k][j + 5];t2 = A[k][j + 6];t3 = A[k][j + 7];t4 = A[k + 1][j + 4];t5 = A[k + 1][j + 5];t6 = A[k + 1][j + 6];t7 = A[k + 1][j + 7];B[j + 4][k] = t0;B[j + 5][k] = t1;B[j + 6][k] = t2;B[j + 7][k] = t3;B[j + 4][k + 1] = t4;B[j + 5][k + 1] = t5;B[j + 6][k + 1] = t6;B[j + 7][k + 1] = t7;}}}}
}

 miss数为1209。

61 * 67

这里的行号分布不再有规律性,尝试了几次后发现每次读取9个数据的效果最好,代码也很简单:

char transpose_submit_61_67_desc[] = "61 * 67";
void transpose_submit_61_67(int M, int N, int A[N][M], int B[M][N]) {int t0, t1, t2, t3, t4, t5, t6, t7, t8;for (int j = 0; j < M; j += 9) {for (int i = 0; i < N; ++i) {t0 = A[i][j];t1 = A[i][j + 1];t2 = A[i][j + 2];t3 = A[i][j + 3];t4 = A[i][j + 4];t5 = A[i][j + 5];t6 = A[i][j + 6];if (j + 7 < M) {t7 = A[i][j + 7];t8 = A[i][j + 8];}B[j][i] = t0;B[j + 1][i] = t1;B[j + 2][i] = t2;B[j + 3][i] = t3;B[j + 4][i] = t4;B[j + 5][i] = t5;B[j + 6][i] = t6;if (j + 7 < M) {B[j + 7][i] = t7;B[j + 8][i] = t8;}}}
}

miss数为1701


本文来自互联网用户投稿,文章观点仅代表作者本人,不代表本站立场,不承担相关法律责任。如若转载,请注明出处。 如若内容造成侵权/违法违规/事实不符,请点击【内容举报】进行投诉反馈!

相关文章

立即
投稿

微信公众账号

微信扫一扫加关注

返回
顶部