from m5.objects import *
import m5
import sys
args = sys.argv[1:]
if "--" in args:
args = args[args.index("--") + 1:]
if len(args) == 0:
print("Usage: build/RISCV/gem5.opt script.py -- <binary> [args...]")
exit(1)
binary = args[0]
binary_args = args[1:]
print(f"Binary: {binary}")
print(f"Arguments: {binary_args}")
system = System()
system.clk_domain = SrcClockDomain()
system.clk_domain.clock = '2GHz'
system.clk_domain.voltage_domain = VoltageDomain()
system.mem_mode = 'timing'
system.mem_ranges = [AddrRange('16GB')]
system.cpu = TimingSimpleCPU()
system.membus = SystemXBar()
class L1Cache(Cache):
assoc = 4
tag_latency = 2
data_latency = 2
response_latency = 2
mshrs = 16
tgts_per_mshr = 20
system.cpu.icache = L1Cache(size='64kB')
system.cpu.icache.cpu_side = system.cpu.icache_port
system.cpu.icache.mem_side = system.membus.cpu_side_ports
system.cpu.dcache = L1Cache(size='64kB')
system.cpu.dcache.cpu_side = system.cpu.dcache_port
system.cpu.dcache.mem_side = system.membus.cpu_side_ports
system.cpu.createInterruptController()
system.mem_ctrl = MemCtrl()
system.mem_ctrl.dram = DDR4_2400_8x8()
system.mem_ctrl.dram.range = system.mem_ranges[0]
system.mem_ctrl.port = system.membus.mem_side_ports
process = Process()
process.cmd = [binary] + binary_args
system.workload = SEWorkload.init_compatible(binary)
system.cpu.workload = process
system.cpu.createThreads()
system.system_port = system.membus.cpu_side_ports
root = Root(full_system=False, system=system)
m5.instantiate()
print("Starting simulation...")
exit_event = m5.simulate()
print(f"Exiting @ tick {m5.curTick()} because {exit_event.getCause()}")
from m5.objects import *
import m5
import sys
args = sys.argv[1:]
if "--" in args:
args = args[args.index("--") + 1:]
if len(args) == 0:
print("Usage: build/RISCV/gem5.opt multilevel_cache.py -- <binary> [args...]")
exit(1)
binary = args[0]
binary_args = args[1:]
print(f"Binary: {binary}")
print(f"Arguments: {binary_args}")
system = System()
system.clk_domain = SrcClockDomain()
system.clk_domain.clock = '2GHz'
system.clk_domain.voltage_domain = VoltageDomain()
system.mem_mode = 'timing'
system.mem_ranges = [AddrRange('16GB')]
system.cpu = TimingSimpleCPU()
system.l2bus = L2XBar()
system.l3bus = L2XBar()
system.membus = SystemXBar()
class L1Cache(Cache):
assoc = 4
tag_latency = 2
data_latency = 2
response_latency = 2
mshrs = 16
tgts_per_mshr = 20
class L2Cache(Cache):
assoc = 8
tag_latency = 8
data_latency = 8
response_latency = 8
mshrs = 32
tgts_per_mshr = 20
class L3Cache(Cache):
assoc = 16
tag_latency = 20
data_latency = 20
response_latency = 20
mshrs = 64
tgts_per_mshr = 20
system.cpu.icache = L1Cache(size='64kB')
system.cpu.dcache = L1Cache(size='64kB')
system.cpu.icache.cpu_side = system.cpu.icache_port
system.cpu.dcache.cpu_side = system.cpu.dcache_port
system.cpu.icache.mem_side = system.l2bus.cpu_side_ports
system.cpu.dcache.mem_side = system.l2bus.cpu_side_ports
system.l2cache = L2Cache(size='512kB')
system.l2cache.cpu_side = system.l2bus.mem_side_ports
system.l2cache.mem_side = system.l3bus.cpu_side_ports
system.l3cache = L3Cache(size='2MB')
system.l3cache.cpu_side = system.l3bus.mem_side_ports
system.l3cache.mem_side = system.membus.cpu_side_ports
system.cpu.createInterruptController()
system.mem_ctrl = MemCtrl()
system.mem_ctrl.dram = DDR4_2400_8x8()
system.mem_ctrl.dram.range = system.mem_ranges[0]
system.mem_ctrl.port = system.membus.mem_side_ports
process = Process()
process.cmd = [binary] + binary_args
system.workload = SEWorkload.init_compatible(binary)
system.cpu.workload = process
system.cpu.createThreads()
system.system_port = system.membus.cpu_side_ports
root = Root(full_system=False, system=system)
m5.instantiate()
print("Starting simulation (Multi-Level Cache)...")
exit_event = m5.simulate()
print(f"Exiting @ tick {m5.curTick()} because {exit_event.getCause()}")
7 access patterns: sequential, hot loop, strided, matrix multiply, memory copy, random access, linked list traversal.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define MATRIX_SIZE 128
#define ARRAY_SIZE 65536
#define RANDOM_ITERS 50000
#define STRIDE 16
#define LIST_SIZE 8192
typedef struct Node { int value; struct Node* next; } Node;
static int A[MATRIX_SIZE][MATRIX_SIZE];
static int B[MATRIX_SIZE][MATRIX_SIZE];
static int C[MATRIX_SIZE][MATRIX_SIZE];
static int arr[ARRAY_SIZE];
static int result_arr[ARRAY_SIZE];
static unsigned int seed = 12345;
static unsigned int pseudo_rand() {
seed ^= seed << 13; seed ^= seed >> 17; seed ^= seed << 5;
return seed;
}
void test_matrix_multiply() {
int i, j, k;
for (i = 0; i < MATRIX_SIZE; i++)
for (j = 0; j < MATRIX_SIZE; j++) {
A[i][j] = (i * MATRIX_SIZE + j) % 100;
B[i][j] = (i + j) % 100; C[i][j] = 0;
}
for (i = 0; i < MATRIX_SIZE; i++)
for (k = 0; k < MATRIX_SIZE; k++)
for (j = 0; j < MATRIX_SIZE; j++)
C[i][j] += A[i][k] * B[k][j];
volatile int sink = C[MATRIX_SIZE/2][MATRIX_SIZE/2]; (void)sink;
printf("[1] Matrix Multiply done. C[64][64] = %d\n", C[MATRIX_SIZE/2][MATRIX_SIZE/2]);
}
void test_sequential_access() {
int i; long sum = 0;
for (i = 0; i < ARRAY_SIZE; i++) arr[i] = i % 1000;
for (i = 0; i < ARRAY_SIZE; i++) sum += arr[i];
for (i = 0; i < ARRAY_SIZE; i++) result_arr[i] = arr[i] * 2;
printf("[2] Sequential Access done. Sum = %ld\n", sum);
}
void test_random_access() {
int i; long sum = 0; unsigned int idx;
for (i = 0; i < ARRAY_SIZE; i++) arr[i] = i;
for (i = 0; i < RANDOM_ITERS; i++) { idx = pseudo_rand() % ARRAY_SIZE; sum += arr[idx]; }
for (i = 0; i < RANDOM_ITERS; i++) { idx = pseudo_rand() % ARRAY_SIZE; arr[idx] = (int)(sum % 1000); }
printf("[3] Random Access done. Sum = %ld\n", sum);
}
void test_strided_access() {
int i; long sum = 0;
for (i = 0; i < ARRAY_SIZE; i++) arr[i] = i % 500;
for (i = 0; i < ARRAY_SIZE; i += STRIDE) sum += arr[i];
for (i = 0; i < ARRAY_SIZE; i += STRIDE) arr[i] = (int)(sum % 1000);
printf("[4] Strided Access done (stride=%d). Sum = %ld\n", STRIDE, sum);
}
void test_linked_list() {
int i; long sum = 0;
Node* nodes = (Node*)malloc(LIST_SIZE * sizeof(Node));
if (!nodes) { printf("[5] malloc failed\n"); return; }
for (i = 0; i < LIST_SIZE - 1; i++) { nodes[i].value = i; nodes[i].next = &nodes[i+1]; }
nodes[LIST_SIZE-1].value = LIST_SIZE-1; nodes[LIST_SIZE-1].next = NULL;
for (i = LIST_SIZE-1; i > 0; i--) {
unsigned int j = pseudo_rand() % (i+1);
Node* tmp = nodes[i].next; nodes[i].next = nodes[j].next; nodes[j].next = tmp;
}
Node* cur = &nodes[0];
while (cur != NULL) { sum += cur->value; cur = cur->next; }
printf("[5] Linked List Traversal done. Sum = %ld\n", sum);
free(nodes);
}
void test_hot_loop() {
int i, j; long sum = 0; int hot[512];
for (i = 0; i < 512; i++) hot[i] = i;
for (j = 0; j < 200; j++) for (i = 0; i < 512; i++) sum += hot[i];
printf("[6] Hot Loop done. Sum = %ld\n", sum);
}
void test_memory_copy() {
int i;
for (i = 0; i < ARRAY_SIZE; i++) arr[i] = i % 256;
for (i = 0; i < ARRAY_SIZE; i++) result_arr[i] = arr[i];
int ok = 1;
for (i = 0; i < 100; i++) if (result_arr[i] != arr[i]) { ok = 0; break; }
printf("[7] Memory Copy done. Verify: %s\n", ok ? "PASS" : "FAIL");
}
int main() {
printf("==============================================\n");
printf(" COA Cache Benchmark\n");
printf("==============================================\n\n");
printf("--- Starting Benchmark Suite ---\n\n");
test_sequential_access();
test_hot_loop();
test_strided_access();
test_matrix_multiply();
test_memory_copy();
test_random_access();
test_linked_list();
printf("\n--- Benchmark Complete ---\n");
printf("Check m5out/stats.txt for cache metrics.\n");
return 0;
}
Compile benchmark for RISC-V:
~/riscv/bin/riscv64-unknown-elf-gcc -static benchmark.c -o benchmark
Run single-level cache simulation:
./build/RISCV/gem5.opt single_level_cache.py -- ./benchmark mv m5out/stats.txt m5out/stats_single.txt
Run multi-level cache simulation:
./build/RISCV/gem5.opt multi_level_cache.py -- ./benchmark mv m5out/stats.txt m5out/stats_multi.txt
Extract metrics:
cat m5out/stats_single.txt | grep -E "missRate|numCycles|overallAvgMissLatency" cat m5out/stats_multi.txt | grep -E "missRate|numCycles|overallAvgMissLatency"
$ ./build/RISCV/gem5.opt single_level_cache.py -- ./benchmark warn: Base 10 memory/cache sizes cast to base 2 (16GiB, 64KiB x2) info: RVV enabled, VLEN = 256 bits, ELEN = 64 bits Starting simulation... [2] Sequential Access done. Sum = 32610880 [6] Hot Loop done. Sum = 26163200 [4] Strided Access done (stride=16). Sum = 1015460 [1] Matrix Multiply done. C[64][64] = 295312 [7] Memory Copy done. Verify: PASS [3] Random Access done. Sum = 1638151029 [5] Linked List Traversal done. Sum = 9560438 --- Benchmark Complete --- Exiting @ tick 179555699000 because exiting with last active thread context
$ ./build/RISCV/gem5.opt multi_level_cache.py -- ./benchmark warn: Base 10 memory/cache sizes cast to base 2 (16GiB, 64KiB, 512KiB, 2MiB) info: RVV enabled, VLEN = 256 bits, ELEN = 64 bits Starting simulation (Multi-Level Cache)... [2] Sequential Access done. Sum = 32610880 [6] Hot Loop done. Sum = 26163200 [4] Strided Access done (stride=16). Sum = 1015460 [1] Matrix Multiply done. C[64][64] = 295312 [7] Memory Copy done. Verify: PASS [3] Random Access done. Sum = 1638151029 [5] Linked List Traversal done. Sum = 9560438 --- Benchmark Complete --- Exiting @ tick 172574311500 because exiting with last active thread context
| Configuration | Cycles | Improvement |
|---|---|---|
| Single Level (L1 only) | 359,111,398 | baseline |
| Multi Level (L1+L2+L3) | 345,148,623 | ▼ 3.89% faster |
| Cache Level | Metric | Single Level | Multi Level |
|---|---|---|---|
| L1 I-Cache | Overall Miss Rate | 0.0001% | 0.0001% |
| L1 D-Cache | Overall Miss Rate | 0.41% | 0.41% |
| L1 D-Cache | Read Miss Rate | 0.22% | 0.22% |
| L1 D-Cache | Write Miss Rate | 1.31% | 1.31% |
| L2 Cache | Overall Miss Rate | N/A | 14.26% |
| L2 Cache | Hit Rate (caught L1 misses) | N/A | 85.74% |
| L3 Cache | Overall Miss Rate | N/A | 61.87% |
| L3 Cache | Hit Rate (caught L2 misses) | N/A | 38.13% |
| Cache | Metric | Single Level (ticks) | Multi Level (ticks) | Reduction |
|---|---|---|---|---|
| L1 D-Cache | Avg Miss Latency | 59,598 | 14,144 | ▼ 76.27% |
| L1 D-Cache | Read Miss Latency | 59,704 | 7,035 | ▼ 88.22% |
| L1 D-Cache | Write Miss Latency | 59,508 | 20,237 | ▼ 66.00% |
| L2 Cache | Avg Miss Latency | N/A | 55,139 | — |
| L3 Cache | Avg Miss Latency | N/A | 68,105 | — |
| Configuration | Calculation | AMAT |
|---|---|---|
| Single Level | 2 + (0.004051 × 59,598) | ~243.5 cycles |
| Multi Level | 2 + (0.004051 × 14,144) | ~59.3 cycles |