PipelineX: Added versions with register fwding alone, static branch prediction,

return address stack.
This commit is contained in:
Bruno Levy
2022-09-08 23:45:33 +02:00
parent e9401388c5
commit 5ee0184d07
8 changed files with 1313 additions and 30 deletions

View File

@@ -1,7 +1,7 @@
PROJECTNAME=SOC
BOARD=ecp5_evn
BOARD_FREQ=12
CPU_FREQ=100
CPU_FREQ=120
FPGA_VARIANT=um5g-85k
FPGA_PACKAGE=CABGA381
VERILOGS=$1

View File

@@ -1,7 +1,7 @@
PROJECTNAME=SOC
BOARD=ulx3s
BOARD_FREQ=25
CPU_FREQ=40
CPU_FREQ=120
FPGA_VARIANT=85k
FPGA_PACKAGE=CABGA381
VERILOGS=$1

View File

@@ -182,14 +182,18 @@ void printk(uint64_t kx) {
void print_coremarks(uint64_t ticks) {
const uint64_t MHz = CLOCKS_PER_SEC/1000000;
printf("*** MHz : %d\n",(int)MHz);
printf("*** Ticks : %d\n",(int)ticks);
printf("*** MHz : %d\n",(int)MHz);
printf("*** Ticks : %d\n",(int)ticks);
uint64_t ksecs=ticks/(CLOCKS_PER_SEC/1000);
printf("*** Time : "); printk(ksecs); printf("\n");
printf("*** Time : "); printk(ksecs); printf("\n");
uint64_t kiter_per_sec= (uint64_t)(ITERATIONS*1000*1000)/ksecs;
printf("*** Iter/s : "); printk(kiter_per_sec); printf("\n");
printf("*** Coremark/s : "); printk(kiter_per_sec/MHz); printf("\n");
printf("*** Iter/s : "); printk(kiter_per_sec); printf("\n");
printf("*** Coremark/s : "); printk(kiter_per_sec/MHz); printf("\n");
uint64_t kticks2 = rdcycle() * (uint64_t)1000;
uint64_t instret2 = rdinstret();
printf("*** CPI (2) : "); printk(kticks2/instret2); printf("\n");
// This one is wrong, TODO: understand why
// printk((uint64_t)(ITERATIONS)*((uint64_t)CLOCKS_PER_SEC*1000)/(uint64_t)total_time);
}

View File

@@ -15,7 +15,7 @@
// then waits for the UART to be not busy. The number of iterations of
// the waiting loop can vary *A LOT* depending on the ratio between CPU
// frequency and UART baud rate.
#define NO_GRAPHIC
//#define NO_GRAPHIC
/*******************************************************************/

View File

@@ -8,7 +8,8 @@
`include "clockworks.v"
`include "emitter_uart.v"
//`define VERBOSE
//`define VERBOSE // uncomment to log pipeline for all executed instructions
//`define LOG_RAS // uncomment to log return address stack operations
/******************************************************************************/
@@ -248,8 +249,10 @@ module Processor (
reg [31:0] RegisterBank [0:31];
// integer depth = 0;
`ifdef LOG_RAS
integer depth = 0;
`endif
always @(posedge clk) begin
if(!D_stall) begin
@@ -265,22 +268,23 @@ module Processor (
RAS_1 <= RAS_0;
RAS_0 <= FD_PC + 4;
/*
`ifdef LOG_RAS
$write("***PC=%0h ",FD_PC);
riscv_disasm(FD_instr, FD_PC);
$write(" ");
$display("jal(%0h) push(%0h) depth=%0d",FD_PC+Jimm(FD_instr), FD_PC+4, depth);
depth <= depth + 1;
*/
`endif
end else if(isJALR(FD_instr) && rdId(FD_instr)==0 && (rs1Id(FD_instr) == 1 || rs1Id(FD_instr)==5)) begin
/*
`ifdef LOG_RAS
$write("***PC=%0h ",FD_PC);
riscv_disasm(FD_instr, FD_PC);
$write(" ");
$display("jalr pop() depth=%0d", depth);
depth <= depth - 1;
*/
`endif
RAS_0 <= RAS_1;
RAS_1 <= RAS_2;

View File

@@ -0,0 +1,662 @@
/**
* pipelineX.v
* Let us see how to morph our multi-cycle CPU into a pipelined CPU !
* Step X: Simplify for higher maxfreq and smaller area
* - register forwarding
* - static branch prediction (BTFNT: backward taken, forward not taken)
* - return address stack (4 entries) used to predict JALR targets
*/
`default_nettype none
`include "clockworks.v"
`include "emitter_uart.v"
/******************************************************************************/
/*
 * Processor: five-stage pipelined RISC-V core (F, D, E, M, W) with
 *  - register forwarding from the M and W stages into E,
 *  - static BTFNT branch prediction resolved in D, verified in E,
 *  - a 4-entry return address stack (RAS) used to predict JALR targets.
 *
 * Ports:
 *   clk, resetn    clock and active-low reset
 *   IO_mem_addr    address presented by the M stage (also used for IO)
 *   IO_mem_rdata   data read from the IO page
 *   IO_mem_wdata   data written to the IO page
 *   IO_mem_wr      high when the M stage stores to the IO page
 *
 * Program ROM and data RAM are internal and are initialized from
 * PROGROM.hex and DATARAM.hex.
 */
module Processor (
   input         clk,
   input         resetn,
   output [31:0] IO_mem_addr,  // IO memory address
   input  [31:0] IO_mem_rdata, // data read from IO memory
   output [31:0] IO_mem_wdata, // data written to IO memory
   output        IO_mem_wr     // IO write flag
);

/******************************************************************************/

 /*
   Reminder for the 10 RISC-V opcodes (instr[6:2])
   -----------------------------------------------
   5'b01100 | ALUreg  | rd <- rs1 OP rs2
   5'b00100 | ALUimm  | rd <- rs1 OP Iimm
   5'b11000 | Branch  | if(rs1 OP rs2) PC<-PC+Bimm
   5'b11001 | JALR    | rd <- PC+4; PC<-rs1+Iimm
   5'b11011 | JAL     | rd <- PC+4; PC<-PC+Jimm
   5'b00101 | AUIPC   | rd <- PC + Uimm
   5'b01101 | LUI     | rd <- Uimm
   5'b00000 | Load    | rd <- mem[rs1+Iimm]
   5'b01000 | Store   | mem[rs1+Simm] <- rs2
   5'b11100 | SYSTEM  | special
 */

/******************************************************************************/

   // Free-running cycle counter and retired-instruction counter,
   // readable through CSRRS (see M_CSR_data in the M stage).
   reg [63:0] cycle;
   reg [63:0] instret;

   always @(posedge clk) begin
      cycle <= !resetn ? 0 : cycle + 1;
   end

   // Pipeline control signals, driven at the end of the module.
   wire D_flush;
   wire E_flush;
   wire F_stall;
   wire D_stall;
   wire halt; // Halt execution (on ebreak)

/******************************************************************************/

                      /***  F: Instruction fetch ***/

   reg [31:0] PC;

   reg [31:0] PROGROM[0:16383]; // 16384 4-byte words
                                // 64 KB of program ROM
   initial begin
      $readmemh("PROGROM.hex",PROGROM);
   end

   // Fetch address, by priority:
   //  1. target predicted in D (JAL, JALR through the RAS, predicted branch)
   //  2. correction coming from E (registered in EM) after a misprediction
   //  3. sequential PC
   wire [31:0] F_PC =
               D_JumpOrBranchNow  ? D_JumpOrBranchAddr  :
               EM_JumpOrBranchNow ? EM_JumpOrBranchAddr :
                                    PC;

   always @(posedge clk) begin
      if(!F_stall) begin
         FD_instr <= PROGROM[F_PC[15:2]];
         FD_PC    <= F_PC;
         PC       <= F_PC+4;
      end

      // The fetched instruction is squashed through FD_nop (not by
      // overwriting FD_instr, see the FD_nop declaration below).
      FD_nop <= D_flush | !resetn;

      if(!resetn) begin
         PC <= 0;
      end
   end

/******************************************************************************/
/******************************************************************************/
   reg [31:0] FD_PC;
   reg [31:0] FD_instr;
   reg        FD_nop; // Needed because I cannot directly write NOP to FD_instr
                      // because FD_instr is plugged to PROGROM's output port.
/******************************************************************************/
/******************************************************************************/

                     /*** D: Instruction decode ***/

   /** These three signals come from the Writeback stage **/
   wire        wbEnable;
   wire [31:0] wbData;
   wire [4:0]  wbRdId;

   wire [4:0] D_rdId  = FD_instr[11:7];
   wire [4:0] D_rs1Id = FD_instr[19:15];
   wire [4:0] D_rs2Id = FD_instr[24:20];

   // commented-out opcode recognizers are optimized below
   // wire D_isJAL    = (FD_instr[6:2]==5'b11011);
   // wire D_isJALR   = (FD_instr[6:2]==5'b11001);
   // wire D_isAUIPC  = (FD_instr[6:2]==5'b00101);
   // wire D_isLUI    = (FD_instr[6:2]==5'b01101);
   // wire D_isBranch = (FD_instr[6:2]==5'b11000);

   wire D_isALUreg = (FD_instr[6:2]==5'b01100);
   wire D_isALUimm = (FD_instr[6:2]==5'b00100);
   wire D_isLoad   = (FD_instr[6:2]==5'b00000);
   wire D_isStore  = (FD_instr[6:2]==5'b01000);
   wire D_isSYSTEM = (FD_instr[6:2]==5'b11100);

   // Optimized opcode recognizers. They test as few bits as possible and
   // are only valid for the ten opcodes listed in the table above (any
   // other opcode may alias one of them).
   wire D_isJAL    = FD_instr[3];
   wire D_isJALR   = {FD_instr[6], FD_instr[3], FD_instr[2]} == 3'b101;

   // LUI (01101) and AUIPC (00101) are the only opcodes with both
   // instr[4] and instr[2] set; instr[5] tells them apart.
   // (The previous tests on instr[6:4] / instr[6] selected SYSTEM instead,
   //  which made D_readsRs1 wrong for LUI, AUIPC and SYSTEM.)
   wire D_isLUI    = {FD_instr[5], FD_instr[4], FD_instr[2]} == 3'b111;
   wire D_isAUIPC  = {FD_instr[5], FD_instr[4], FD_instr[2]} == 3'b011;
   wire D_isBranch = {FD_instr[6], FD_instr[4], FD_instr[2]} == 3'b100;

   wire D_isJALorJALR  = (FD_instr[2] & FD_instr[6]);
   wire D_isLUIorAUIPC = (FD_instr[4] & FD_instr[2]);

   // Register-read flags, used only by the load-use hazard detection.
   wire D_readsRs1 = !(D_isJAL || D_isLUIorAUIPC);

   wire D_readsRs2 = (FD_instr[5] && (FD_instr[3:2] == 2'b00));
   //  <=> D_isALUreg || D_isBranch || D_isStore || D_isSYSTEM

   wire [31:0] D_Uimm = { FD_instr[31],FD_instr[30:12], {12{1'b0}}};

   wire [31:0] D_Bimm = {{20{FD_instr[31]}},
                         FD_instr[7],FD_instr[30:25],FD_instr[11:8],1'b0};

   wire [31:0] D_Jimm = {{12{FD_instr[31]}},
                         FD_instr[19:12],FD_instr[20],FD_instr[30:21],1'b0};

   // BTFNT (Backwards taken forwards not taken)
   // I[31]=Bimm sgn (pred bkwd branch taken)
   wire D_predictBranch = FD_instr[31];

   // Redirect fetch from D for JAL, JALR (predicted through the RAS)
   // and branches that are predicted taken.
   wire D_JumpOrBranchNow = !FD_nop && (
           D_isJAL || D_isJALR || (D_isBranch && D_predictBranch)
        );

   // Return address stack (RAS_0 is the top of the stack)
   reg [31:0] RAS_0;
   reg [31:0] RAS_1;
   reg [31:0] RAS_2;
   reg [31:0] RAS_3;

   // Every JALR is predicted to jump to the top of the RAS; the actual
   // target is checked in E against DE_predictRA.
   wire [31:0] D_JumpOrBranchAddr =
               D_isJALR ? RAS_0 :
                          (FD_PC + (D_isJAL ? D_Jimm : D_Bimm));

   reg [31:0] RegisterBank [0:31];

   always @(posedge clk) begin
      DE_rdId      <= D_rdId;
      DE_rs1Id     <= D_rs1Id;
      DE_rs2Id     <= D_rs2Id;
      DE_funct3    <= FD_instr[14:12];
      DE_funct3_is <= 8'b00000001 << FD_instr[14:12];
      DE_funct7    <= FD_instr[30];
      DE_csrId     <= {FD_instr[27],FD_instr[21]};
      DE_nop       <= 1'b0;

      if(!D_stall) begin
         DE_isALUreg <= D_isALUreg;
         DE_isALUimm <= D_isALUimm;
         DE_isBranch <= D_isBranch;
         DE_isJALR   <= D_isJALR;
         DE_isJAL    <= D_isJAL;
         DE_isAUIPC  <= D_isAUIPC;
         DE_isLUI    <= D_isLUI;
         DE_isLoad   <= D_isLoad;
         DE_isStore  <= D_isStore;
         DE_isCSRRS  <= D_isSYSTEM && FD_instr[13];
         DE_isEBREAK <= D_isSYSTEM && !FD_instr[13];

         // wbEnable = !isBranch & !isStore
         // Note: EM_wbEnable = DE_wbEnable && (rdId != 0)
         DE_wbEnable <= (FD_instr[5:2] != 4'b1000);
      end

      // Insert a bubble: this block comes last among the assignments
      // to these registers, so it overrides the values written above.
      if(E_flush | FD_nop) begin
         DE_nop      <= 1'b1;
         DE_isALUreg <= 1'b0;
         DE_isALUimm <= 1'b0;
         DE_isBranch <= 1'b0;
         DE_isJALR   <= 1'b0;
         DE_isJAL    <= 1'b0;
         DE_isAUIPC  <= 1'b0;
         DE_isLUI    <= 1'b0;
         DE_isLoad   <= 1'b0;
         DE_isStore  <= 1'b0;
         DE_isCSRRS  <= 1'b0;
         DE_isEBREAK <= 1'b0;
         DE_wbEnable <= 1'b0;
      end

      // Register file write port (driven by the W stage)
      if(wbEnable) begin
         RegisterBank[wbRdId] <= wbData;
      end

      // Sign-extended I-type or S-type immediate
      DE_IorSimm <= {
           {21{FD_instr[31]}},
           D_isStore ? {FD_instr[30:25],FD_instr[11:7]} :
                       FD_instr[30:20]
      };

      // Used in case of misprediction:
      //    PC+Bimm if branch forward, PC+4 if branch backward
      DE_PCplus4orBimm <= FD_PC + (FD_instr[31] ? 4 : D_Bimm);

      // DE_PCplus4orUimm =
      //     ((isLUI ? 0 : FD_PC)) + ((isJAL | isJALR) ? 4 : Uimm)
      // (knowing that isLUI | isAUIPC | isJAL | isJALR)
      DE_PCplus4orUimm <= ({32{FD_instr[6:5]!=2'b01}} & FD_PC) +
                          (D_isJALorJALR ? 4 : D_Uimm);

      DE_isJALorJALRorLUIorAUIPC <= FD_instr[2];

      DE_predictBranch <= D_predictBranch;
      DE_predictRA     <= RAS_0; // value of the RAS top *before* any pop below

      if(!D_stall && !FD_nop && !D_flush) begin
         // JAL with rd == x1: push the return address
         if(D_isJAL && D_rdId==1) begin
            RAS_3 <= RAS_2;
            RAS_2 <= RAS_1;
            RAS_1 <= RAS_0;
            RAS_0 <= FD_PC + 4;
         end
         // JALR with rd == x0 and rs1 == x1 or x5: pop
         if(D_isJALR && D_rdId==0 && (D_rs1Id == 1 || D_rs1Id==5)) begin
            RAS_0 <= RAS_1;
            RAS_1 <= RAS_2;
            RAS_2 <= RAS_3;
         end
      end
   end

/******************************************************************************/
/******************************************************************************/
   reg        DE_nop; // Needed by instret in W stage
   reg [4:0]  DE_rdId;
   reg [4:0]  DE_rs1Id;
   reg [4:0]  DE_rs2Id;

   reg [1:0]  DE_csrId;

   reg [2:0]  DE_funct3;
   (* onehot *) reg [7:0] DE_funct3_is;
   reg [5:5]  DE_funct7;

   reg [31:0] DE_IorSimm;

   reg DE_isALUreg;
   reg DE_isALUimm;
   reg DE_isBranch;
   reg DE_isJALR;
   reg DE_isJAL;
   reg DE_isAUIPC;
   reg DE_isLUI;
   reg DE_isLoad;
   reg DE_isStore;
   reg DE_isCSRRS;
   reg DE_isEBREAK;

   reg DE_wbEnable; // !isBranch && !isStore (rdId != 0 is tested in E)

   reg DE_isJALorJALRorLUIorAUIPC;

   reg [31:0] DE_PCplusBorJimm; // not used in this version
   reg [31:0] DE_PCplus4orBimm;
   reg [31:0] DE_PCplus4orUimm;

   reg        DE_predictBranch;
   reg [31:0] DE_predictRA;

/******************************************************************************/
/******************************************************************************/

                     /*** E: Execute ***/

   /*********** Register forwarding *************************************/

   wire E_M_fwd_rs1 = EM_wbEnable && (EM_rdId == DE_rs1Id);
   wire E_W_fwd_rs1 = MW_wbEnable && (MW_rdId == DE_rs1Id);

   wire E_M_fwd_rs2 = EM_wbEnable && (EM_rdId == DE_rs2Id);
   wire E_W_fwd_rs2 = MW_wbEnable && (MW_rdId == DE_rs2Id);

   // The M stage has priority over the W stage (most recent value wins).
   wire [31:0] E_rs1 = E_M_fwd_rs1 ? EM_Eresult :
                       E_W_fwd_rs1 ? wbData     :
                                     RegisterBank[DE_rs1Id] ;

   wire [31:0] E_rs2 = E_M_fwd_rs2 ? EM_Eresult :
                       E_W_fwd_rs2 ? wbData     :
                                     RegisterBank[DE_rs2Id] ;

   /*********** the ALU *************************************************/

   wire [31:0] E_aluIn1 = E_rs1;
   wire [31:0] E_aluIn2 = (DE_isALUreg | DE_isBranch) ? E_rs2 : DE_IorSimm;
   wire [4:0]  E_shamt  = DE_isALUreg ? E_rs2[4:0] : DE_rs2Id; // unused here:
                                  // the shifter below takes E_aluIn2[4:0]

   wire E_minus       = DE_funct7[5] & DE_isALUreg;
   wire E_arith_shift = DE_funct7[5];

   // The adder is used by both arithmetic instructions and JALR.
   wire [31:0] E_aluPlus = E_aluIn1 + E_aluIn2;

   // Use a single 33 bits subtract to do subtraction and all comparisons
   // (trick borrowed from swapforth/J1)
   wire [32:0] E_aluMinus = {1'b1, ~E_aluIn2} + {1'b0,E_aluIn1} + 33'b1;
   wire        E_LT  =
                 (E_aluIn1[31] ^ E_aluIn2[31]) ? E_aluIn1[31] : E_aluMinus[32];
   wire        E_LTU = E_aluMinus[32];
   wire        E_EQ  = (E_aluMinus[31:0] == 0);

   // Flip a 32 bit word. Used by the shifter (a single shifter for
   // left and right shifts, saves silicon !)
   function [31:0] flip32;
      input [31:0] x;
      flip32 = {x[ 0], x[ 1], x[ 2], x[ 3], x[ 4], x[ 5], x[ 6], x[ 7],
                x[ 8], x[ 9], x[10], x[11], x[12], x[13], x[14], x[15],
                x[16], x[17], x[18], x[19], x[20], x[21], x[22], x[23],
                x[24], x[25], x[26], x[27], x[28], x[29], x[30], x[31]};
   endfunction

   // Left shift = flip, right shift, flip back.
   wire [31:0] E_shifter_in = (DE_funct3==3'b001) ? flip32(E_aluIn1) : E_aluIn1;

   /* verilator lint_off WIDTH */
   wire [31:0] E_shifter =
       $signed({E_arith_shift & E_aluIn1[31], E_shifter_in}) >>> E_aluIn2[4:0];
   /* verilator lint_on WIDTH */

   wire [31:0] E_leftshift = flip32(E_shifter);

   // Result selection, driven by the one-hot decoded funct3.
   wire [31:0] E_aluOut =
     (DE_funct3_is[0] ? (E_minus ? E_aluMinus[31:0] : E_aluPlus) : 32'b0) |
     (DE_funct3_is[1] ? E_leftshift                              : 32'b0) |
     (DE_funct3_is[2] ? {31'b0, E_LT }                           : 32'b0) |
     (DE_funct3_is[3] ? {31'b0, E_LTU}                           : 32'b0) |
     (DE_funct3_is[4] ? E_aluIn1 ^ E_aluIn2                      : 32'b0) |
     (DE_funct3_is[5] ? E_shifter                                : 32'b0) |
     (DE_funct3_is[6] ? E_aluIn1 | E_aluIn2                      : 32'b0) |
     (DE_funct3_is[7] ? E_aluIn1 & E_aluIn2                      : 32'b0) ;

   /*********** Branch, JAL, JALR ***********************************/

   wire E_takeBranch =
     (DE_funct3_is[0] &  E_EQ ) | // BEQ
     (DE_funct3_is[1] & !E_EQ ) | // BNE
     (DE_funct3_is[4] &  E_LT ) | // BLT
     (DE_funct3_is[5] & !E_LT ) | // BGE
     (DE_funct3_is[6] &  E_LTU) | // BLTU
     (DE_funct3_is[7] & !E_LTU) ; // BGEU

   wire [31:0] E_JALRaddr = {E_aluPlus[31:1],1'b0};

   // E only redirects the fetch on a misprediction: wrong RAS target
   // for JALR, or wrong direction for a branch.
   wire E_JumpOrBranch = (
         (DE_isJALR   && (DE_predictRA != E_JALRaddr)   ) ||
         (DE_isBranch && (E_takeBranch^DE_predictBranch))
   );

   wire [31:0] E_JumpOrBranchAddr = DE_isBranch ? DE_PCplus4orBimm : E_JALRaddr;

   wire [31:0] E_result =
               DE_isJALorJALRorLUIorAUIPC ? DE_PCplus4orUimm : E_aluOut;

   // Load/store address (dedicated adder)
   wire [31:0] E_addr = E_rs1 + DE_IorSimm;

   /**************************************************************/

   always @(posedge clk) begin
      EM_nop      <= DE_nop;
      EM_rdId     <= DE_rdId;
      EM_rs1Id    <= DE_rs1Id;
      EM_rs2Id    <= DE_rs2Id;
      EM_funct3   <= DE_funct3;
      EM_csrId_is <= 4'b0001 << DE_csrId;
      EM_rs2      <= E_rs2;
      EM_Eresult  <= E_result;
      EM_addr     <= E_addr;
      EM_Mdata    <= DATARAM[E_addr[15:2]]; // data RAM is read in E
      EM_isLoad   <= DE_isLoad;
      EM_isStore  <= DE_isStore;
      EM_isCSRRS  <= DE_isCSRRS;
      EM_wbEnable <= DE_wbEnable && (DE_rdId != 0);
      EM_JumpOrBranchNow  <= E_JumpOrBranch;
      EM_JumpOrBranchAddr <= E_JumpOrBranchAddr;
   end

   assign halt = resetn & DE_isEBREAK;

/******************************************************************************/
/******************************************************************************/
   reg        EM_nop; // Needed by instret in W stage
   reg [4:0]  EM_rdId;
   reg [4:0]  EM_rs1Id;
   reg [4:0]  EM_rs2Id;
   (* onehot *) reg [3:0] EM_csrId_is;
   reg [2:0]  EM_funct3;
   reg [31:0] EM_rs2;
   reg [31:0] EM_Eresult;
   reg [31:0] EM_addr;
   reg [31:0] EM_Mdata;
   reg        EM_isStore;
   reg        EM_isLoad;
   reg        EM_isCSRRS;
   reg        EM_wbEnable;
   reg        EM_JumpOrBranchNow;
   reg [31:0] EM_JumpOrBranchAddr;
/******************************************************************************/
/******************************************************************************/

                     /*** M: Memory ***/

   wire M_isB = (EM_funct3[1:0] == 2'b00);
   wire M_isH = (EM_funct3[1:0] == 2'b01);

   /*************** STORE **************************/

   // Replicate the byte / halfword to store on the lanes selected
   // by the write mask below.
   wire [31:0] M_STORE_data;
   assign M_STORE_data[ 7: 0] = EM_rs2[7:0];
   assign M_STORE_data[15: 8] = EM_addr[0] ? EM_rs2[7:0] : EM_rs2[15: 8] ;
   assign M_STORE_data[23:16] = EM_addr[1] ? EM_rs2[7:0] : EM_rs2[23:16] ;
   assign M_STORE_data[31:24] = EM_addr[0] ? EM_rs2[7:0] :
                                EM_addr[1] ? EM_rs2[15:8] : EM_rs2[31:24] ;

   // The memory write mask:
   //    1111                     if writing a word
   //    0011 or 1100             if writing a halfword
   //                                (depending on EM_addr[1])
   //    0001, 0010, 0100 or 1000 if writing a byte
   //                                (depending on EM_addr[1:0])
   wire [3:0] M_STORE_wmask = M_isB ?
                                (EM_addr[1] ?
                                   (EM_addr[0] ? 4'b1000 : 4'b0100) :
                                   (EM_addr[0] ? 4'b0010 : 4'b0001)
                                ) :
                              M_isH ? (EM_addr[1] ? 4'b1100 : 4'b0011) :
                                      4'b1111 ;

   // Address bit 22 selects the IO page, everything else goes to data RAM.
   wire M_isIO  = EM_addr[22];
   wire M_isRAM = !M_isIO;

   assign IO_mem_addr  = EM_addr;
   assign IO_mem_wr    = EM_isStore && M_isIO; // && M_STORE_wmask[0];
   assign IO_mem_wdata = EM_rs2;

   wire [3:0] M_wmask = {4{EM_isStore & M_isRAM}} & M_STORE_wmask;

   reg [31:0] DATARAM [0:16383]; // 16384 4-byte words
                                 // 64 KB of data RAM in total
   wire [13:0] M_word_addr = EM_addr[15:2];

   always @(posedge clk) begin
      if(M_wmask[0]) DATARAM[M_word_addr][ 7:0 ] <= M_STORE_data[ 7:0 ];
      if(M_wmask[1]) DATARAM[M_word_addr][15:8 ] <= M_STORE_data[15:8 ];
      if(M_wmask[2]) DATARAM[M_word_addr][23:16] <= M_STORE_data[23:16];
      if(M_wmask[3]) DATARAM[M_word_addr][31:24] <= M_STORE_data[31:24];
   end

   wire M_sext = !EM_funct3[2];

   /*************** LOAD ****************************/

   wire [15:0] M_LOAD_H=EM_addr[1] ? EM_Mdata[31:16]: EM_Mdata[15:0];
   wire  [7:0] M_LOAD_B=EM_addr[0] ? M_LOAD_H[15:8] : M_LOAD_H[7:0];
   wire        M_LOAD_sign=M_sext & (M_isB ? M_LOAD_B[7] : M_LOAD_H[15]);

   wire [31:0] M_Mdata = M_isB ? {{24{M_LOAD_sign}},M_LOAD_B} :
                         M_isH ? {{16{M_LOAD_sign}},M_LOAD_H} :
                                 EM_Mdata ;

   // CSR read value, selected by the one-hot decoded {instr[27],instr[21]}
   wire [31:0] M_CSR_data =
         (EM_csrId_is[0] ? cycle[31:0]    : 32'b0) |
         (EM_csrId_is[2] ? cycle[63:32]   : 32'b0) |
         (EM_csrId_is[1] ? instret[31:0]  : 32'b0) |
         (EM_csrId_is[3] ? instret[63:32] : 32'b0) ;

   initial begin
      $readmemh("DATARAM.hex",DATARAM);
   end

   always @(posedge clk) begin
      MW_nop    <= EM_nop;
      MW_rdId   <= EM_rdId;
      MW_wbData <=
          EM_isLoad  ? (M_isIO ? IO_mem_rdata : M_Mdata) :
          EM_isCSRRS ? M_CSR_data :
                       EM_Eresult;
      MW_wbEnable <= EM_wbEnable;

      if(!resetn) begin
         instret <= 0;
      end else if(!MW_nop) begin
         // It's easier to count the retired instructions when
         // they *exit* the pipeline (but it requires to pass
         // a _nop flag through the pipeline).
         instret <= instret + 1;
      end
   end

/******************************************************************************/
/******************************************************************************/
   reg        MW_nop; // Needed by instret in W stage
   reg [4:0]  MW_rdId;
   reg [31:0] MW_wbData;
   reg        MW_wbEnable;
/******************************************************************************/
/******************************************************************************/

                     /*** W: WriteBack ***/

   assign wbData   = MW_wbData;
   assign wbEnable = MW_wbEnable;
   assign wbRdId   = MW_rdId;

/******************************************************************************/

                     /*** Hazards, stalls and flushes ***/

   // we do not test rdId == 0 because in general, one loads data to
   // a register, not to zero !
   wire rs1Hazard = D_readsRs1 && (D_rs1Id == DE_rdId);
   wire rs2Hazard = D_readsRs2 && (D_rs2Id == DE_rdId);

   // Alternative: a simpler test that generates slightly more bubbles
   // (to be used if the critical path is here):
   // wire rs1Hazard = (D_rs1Id == DE_rdId);
   // wire rs2Hazard = (D_rs2Id == DE_rdId);

   // we are not obliged to compare all bits !
   // wire rs1Hazard = (D_rs1Id[3:0] == DE_rdId[3:0]);
   // wire rs2Hazard = (D_rs2Id[3:0] == DE_rdId[3:0]);

   // Add bubble only if next instr uses result of latency-2 instr
   wire dataHazard = !FD_nop && (DE_isLoad || DE_isCSRRS) &&
                     (rs1Hazard || rs2Hazard);

   // (other option: always add bubble after latency-2 instr
   // like Samsoniuk's DarkRiscV). Reduces critical path.
   // wire dataHazard = !FD_nop && (DE_isLoad || DE_isCSRRS);

   assign F_stall = dataHazard | halt;
   assign D_stall = dataHazard | halt;

   // Here we need to use E_JumpOrBranch (the registered version
   // EM_JumpOrBranchNow would come one cycle too late).
   assign D_flush = E_JumpOrBranch;
   assign E_flush = E_JumpOrBranch | dataHazard;

/******************************************************************************/

`ifdef BENCH
   always @(posedge clk) begin
      if(halt) $finish();
   end
`endif

/******************************************************************************/

endmodule
/*
 * SOC: top level. Connects the Processor to the clock/reset generator,
 * five LEDs and a transmit-only UART, all reached through the
 * processor's memory-mapped IO port.
 */
module SOC (
   input            CLK,   // system clock
   input            RESET, // reset button
   output reg [4:0] LEDS,  // system LEDs
   input            RXD,   // UART receive (not connected in this module)
   output           TXD    // UART transmit
);

   // Memory-mapped IO in IO page, 1-hot addressing in word address.
   localparam IO_LEDS_bit      = 0;  // W five leds
   localparam IO_UART_DAT_bit  = 1;  // W data to send (8 bits)
   localparam IO_UART_CNTL_bit = 2;  // R status. bit 9: busy sending

   /*************** clock and reset **********************************/

   wire clk;
   wire resetn;

   // Gearbox and reset circuitry.
   Clockworks CW(
      .CLK(CLK),
      .RESET(RESET),
      .clk(clk),
      .resetn(resetn)
   );

   /*************** processor ****************************************/

   wire [31:0] IO_mem_addr;
   wire [31:0] IO_mem_rdata;
   wire [31:0] IO_mem_wdata;
   wire        IO_mem_wr;

   Processor CPU(
      .clk(clk),
      .resetn(resetn),
      .IO_mem_addr(IO_mem_addr),
      .IO_mem_rdata(IO_mem_rdata),
      .IO_mem_wdata(IO_mem_wdata),
      .IO_mem_wr(IO_mem_wr)
   );

   // One bit of the word address selects one device.
   wire [13:0] io_word_sel = IO_mem_addr[15:2];

   /*************** LEDs *********************************************/

   wire leds_wr = IO_mem_wr & io_word_sel[IO_LEDS_bit];

   always @(posedge clk) begin
      if(leds_wr) begin
         LEDS <= IO_mem_wdata[4:0];
      end
   end

   /*************** UART (transmit only) *****************************/

   // A write to the UART data word sends the low byte of the written data.
   wire uart_tx_strobe = IO_mem_wr & io_word_sel[IO_UART_DAT_bit];
   wire uart_tx_ready;

   corescore_emitter_uart #(
      .clk_freq_hz(`CPU_FREQ*1000000),
      .baud_rate(1000000)
   ) UART(
      .i_clk(clk),
      .i_rst(!resetn),
      .i_data(IO_mem_wdata[7:0]),
      .i_valid(uart_tx_strobe),
      .o_ready(uart_tx_ready),
      .o_uart_tx(TXD)
   );

   // Status word: bit 9 is set while the UART is not ready.
   wire [31:0] uart_status = { 22'b0, !uart_tx_ready, 9'b0};

   /*************** IO read data *************************************/

   assign IO_mem_rdata =
          io_word_sel[IO_UART_CNTL_bit] ? uart_status
                                        : 32'b0;

`ifdef BENCH
   // In simulation, echo every character sent to the UART.
   always @(posedge clk) begin
      if(uart_tx_strobe) begin
         $write("%c", IO_mem_wdata[7:0] );
         $fflush(32'h8000_0001);
      end
   end
`endif

endmodule

View File

@@ -113,10 +113,10 @@ module Processor (
// wire D_isJALR = (FD_instr[6:2]==5'b11001);
// wire D_isAUIPC = (FD_instr[6:2]==5'b00101);
// wire D_isLUI = (FD_instr[6:2]==5'b01101);
// wire D_isBranch = (FD_instr[6:2]==5'b11000);
// wire D_isBranch = (FD_instr[6:2]==5'b11000);
// wire D_isLoad = (FD_instr[6:2]==5'b00000);
wire D_isALUreg = (FD_instr[6:2]==5'b01100);
wire D_isALUimm = (FD_instr[6:2]==5'b00100);
wire D_isLoad = (FD_instr[6:2]==5'b00000);
wire D_isStore = (FD_instr[6:2]==5'b01000);
wire D_isSYSTEM = (FD_instr[6:2]==5'b11100);
@@ -126,7 +126,7 @@ module Processor (
wire D_isLUI = FD_instr[6:4] == 3'b111;
wire D_isAUIPC = FD_instr[6:4] == 3'b101;
wire D_isBranch = {FD_instr[6], FD_instr[4], FD_instr[2]} == 3'b100;
wire D_isLoad = !|FD_instr[6:2];
wire D_isJALorJALR = (FD_instr[2] & FD_instr[6]);
wire D_isLUIorAUIPC = (FD_instr[4] & FD_instr[6]);
@@ -224,7 +224,7 @@ module Processor (
DE_isJALorJALRorLUIorAUIPC <= FD_instr[2];
DE_back <= FD_instr[31]; // Bimm sign (pred=bkwd branch taken)
DE_predictBranch <= FD_instr[31]; // Bimm sign (pred=bkwd branch taken)
end
/******************************************************************************/
@@ -256,11 +256,10 @@ module Processor (
reg DE_wbEnable; // !isBranch && !isStore && rdId != 0
reg DE_isJALorJALRorLUIorAUIPC;
reg [31:0] DE_PCplusBorJimm;
reg [31:0] DE_PCplus4orBimm;
reg [31:0] DE_PCplus4orUimm;
reg DE_back;
reg DE_predictBranch;
/******************************************************************************/
/******************************************************************************/
@@ -344,7 +343,7 @@ module Processor (
wire E_JumpOrBranch = (
DE_isJALR ||
((DE_isBranch) && (E_takeBranch^DE_back))
((DE_isBranch) && (E_takeBranch^DE_predictBranch))
);
wire [31:0] E_JumpOrBranchAddr =
@@ -479,8 +478,6 @@ module Processor (
always @(posedge clk) begin
MW_nop <= EM_nop;
MW_rdId <= EM_rdId;
MW_rs1Id <= EM_rs1Id;
MW_rs2Id <= EM_rs2Id;
MW_wbData <=
EM_isLoad ? (M_isIO ? IO_mem_rdata : M_Mdata) :
@@ -503,8 +500,6 @@ module Processor (
/******************************************************************************/
reg MW_nop; // Needed by instret in W stage
reg [4:0] MW_rdId;
reg [4:0] MW_rs1Id;
reg [4:0] MW_rs2Id;
reg [31:0] MW_wbData;
reg MW_wbEnable;
/******************************************************************************/
@@ -520,15 +515,15 @@ module Processor (
// we do not test rdId == 0 because in general, one loads data to
// a register, not to zero !
// wire rs1Hazard = D_readsRs1 && (D_rs1Id == DE_rdId);
// wire rs2Hazard = D_readsRs2 && (D_rs2Id == DE_rdId);
wire rs1Hazard = D_readsRs1 && (D_rs1Id == DE_rdId);
wire rs2Hazard = D_readsRs2 && (D_rs2Id == DE_rdId);
// we could generate slightly more bubble with
// simpler test (to be used if critical path is here)
// -> keeping this one (seems it has no influence on CPI,
// and results in slightly better timings)
wire rs1Hazard = (D_rs1Id == DE_rdId);
wire rs2Hazard = (D_rs2Id == DE_rdId);
// wire rs1Hazard = (D_rs1Id == DE_rdId);
// wire rs2Hazard = (D_rs2Id == DE_rdId);
// we are not obliged to compare all bits !
// wire rs1Hazard = (D_rs1Id[3:0] == DE_rdId[3:0]);

View File

@@ -0,0 +1,618 @@
/**
* pipeline6.v
* Let us see how to morph our multi-cycle CPU into a pipelined CPU !
* Step X: Simplify for higher maxfreq and smaller area
* - register forwarding
* TODO: reintegrate branch prediction and return address stack
*/
`default_nettype none
`include "clockworks.v"
`include "emitter_uart.v"
/******************************************************************************/
module Processor (
input clk,
input resetn,
output [31:0] IO_mem_addr, // IO memory address
input [31:0] IO_mem_rdata, // data read from IO memory
output [31:0] IO_mem_wdata, // data written to IO memory
output IO_mem_wr // IO write flag
);
/******************************************************************************/
/*
Reminder for the 10 RISC-V codeops
----------------------------------
5'b01100 | ALUreg | rd <- rs1 OP rs2
5'b00100 | ALUimm | rd <- rs1 OP Iimm
5'b11000 | Branch | if(rs1 OP rs2) PC<-PC+Bimm
5'b11001 | JALR | rd <- PC+4; PC<-rs1+Iimm
5'b11011 | JAL | rd <- PC+4; PC<-PC+Jimm
5'b00101 | AUIPC | rd <- PC + Uimm
5'b01101 | LUI | rd <- Uimm
5'b00000 | Load | rd <- mem[rs1+Iimm]
5'b01000 | Store | mem[rs1+Simm] <- rs2
5'b11100 | SYSTEM | special
*/
/******************************************************************************/
reg [63:0] cycle;
reg [63:0] instret;
always @(posedge clk) begin
cycle <= !resetn ? 0 : cycle + 1;
end
wire D_flush;
wire E_flush;
wire F_stall;
wire D_stall;
wire halt; // Halt execution (on ebreak)
/******************************************************************************/
/*** F: Instruction fetch ***/
reg [31:0] PC;
reg [31:0] PROGROM[0:16383]; // 16384 4-bytes words
// 64 Kb of program ROM
initial begin
$readmemh("PROGROM.hex",PROGROM);
end
wire [31:0] F_PC = EM_JumpOrBranchNow ? EM_JumpOrBranchAddr : PC;
always @(posedge clk) begin
if(!F_stall) begin
FD_instr <= PROGROM[F_PC[15:2]];
FD_PC <= F_PC;
PC <= F_PC+4;
end
FD_nop <= D_flush | !resetn;
if(!resetn) begin
PC <= 0;
end
end
/******************************************************************************/
/******************************************************************************/
reg [31:0] FD_PC;
reg [31:0] FD_instr;
reg FD_nop; // Needed because I cannot directly write NOP to FD_instr
// because FD_instr is plugged to PROGROM's output port.
/******************************************************************************/
/******************************************************************************/
/*** D: Instruction decode ***/
/** These three signals come from the Writeback stage **/
wire wbEnable;
wire [31:0] wbData;
wire [4:0] wbRdId;
wire [4:0] D_rdId = FD_instr[11:7];
wire [4:0] D_rs1Id = FD_instr[19:15];
wire [4:0] D_rs2Id = FD_instr[24:20];
// commented-out codeop recognizers are optimized below
// wire D_isJAL = (FD_instr[6:2]==5'b11011);
// wire D_isJALR = (FD_instr[6:2]==5'b11001);
// wire D_isAUIPC = (FD_instr[6:2]==5'b00101);
// wire D_isLUI = (FD_instr[6:2]==5'b01101);
// wire D_isBranch = (FD_instr[6:2]==5'b11000);
// wire D_isLoad = (FD_instr[6:2]==5'b00000);
wire D_isALUreg = (FD_instr[6:2]==5'b01100);
wire D_isALUimm = (FD_instr[6:2]==5'b00100);
wire D_isStore = (FD_instr[6:2]==5'b01000);
wire D_isSYSTEM = (FD_instr[6:2]==5'b11100);
// optimized codop recognizers
wire D_isJAL = FD_instr[3];
wire D_isJALR = {FD_instr[6], FD_instr[3], FD_instr[2]} == 3'b101;
wire D_isLUI = FD_instr[6:4] == 3'b111;
wire D_isAUIPC = FD_instr[6:4] == 3'b101;
wire D_isBranch = {FD_instr[6], FD_instr[4], FD_instr[2]} == 3'b100;
wire D_isLoad = !|FD_instr[6:2];
wire D_isJALorJALR = (FD_instr[2] & FD_instr[6]);
wire D_isLUIorAUIPC = (FD_instr[4] & FD_instr[6]);
wire D_readsRs1 = !(D_isJAL || D_isLUIorAUIPC);
wire D_readsRs2 = (FD_instr[5] && (FD_instr[3:2] == 2'b00));
// <=> D_isALUreg || D_isBranch || D_isStore || D_isSYSTEM
wire [31:0] D_Uimm = { FD_instr[31],FD_instr[30:12], {12{1'b0}}};
wire [31:0] D_Bimm = {{20{FD_instr[31]}},
FD_instr[7],FD_instr[30:25],FD_instr[11:8],1'b0};
wire [31:0] D_Jimm = {{12{FD_instr[31]}},
FD_instr[19:12],FD_instr[20],FD_instr[30:21],1'b0};
reg [31:0] RegisterBank [0:31];
always @(posedge clk) begin
DE_rdId <= D_rdId;
DE_rs1Id <= D_rs1Id;
DE_rs2Id <= D_rs2Id;
DE_funct3 <= FD_instr[14:12];
DE_funct3_is <= 8'b00000001 << FD_instr[14:12];
DE_funct7 <= FD_instr[30];
DE_csrId <= {FD_instr[27],FD_instr[21]};
DE_nop <= 1'b0;
if(!D_stall) begin
DE_isALUreg <= D_isALUreg;
DE_isALUimm <= D_isALUimm;
DE_isBranch <= D_isBranch;
DE_isJALR <= D_isJALR;
DE_isJAL <= D_isJAL;
DE_isAUIPC <= D_isAUIPC;
DE_isLUI <= D_isLUI;
DE_isLoad <= D_isLoad;
DE_isStore <= D_isStore;
DE_isCSRRS <= D_isSYSTEM && FD_instr[13];
DE_isEBREAK <= D_isSYSTEM && !FD_instr[13];
// wbEnable = !isBranch & !isStore
// Note: EM_wbEnable = DE_wbEnable && (rdId != 0)
DE_wbEnable <= (FD_instr[5:2] != 4'b1000);
end
if(E_flush | FD_nop) begin
DE_nop <= 1'b1;
DE_isALUreg <= 1'b0;
DE_isALUimm <= 1'b0;
DE_isBranch <= 1'b0;
DE_isJALR <= 1'b0;
DE_isJAL <= 1'b0;
DE_isAUIPC <= 1'b0;
DE_isLUI <= 1'b0;
DE_isLoad <= 1'b0;
DE_isStore <= 1'b0;
DE_isCSRRS <= 1'b0;
DE_isEBREAK <= 1'b0;
DE_wbEnable <= 1'b0;
end
if(wbEnable) begin
RegisterBank[wbRdId] <= wbData;
end
DE_IorSimm <= {
{21{FD_instr[31]}},
D_isStore ? {FD_instr[30:25],FD_instr[11:7]} :
FD_instr[30:20]
};
// DE_PCplus4orUimm =
// ((isLUI ? 0 : FD_PC)) + ((isJAL | isJALR) ? 4 : Uimm)
// (knowing that isLUI | isAUIPC | isJAL | isJALR)
DE_PCplus4orUimm <= ({32{FD_instr[6:5]!=2'b01}} & FD_PC) +
(D_isJALorJALR ? 4 : D_Uimm);
DE_PCplusBorJimm <= FD_PC + (D_isJAL ? D_Jimm : D_Bimm);
DE_isJALorJALRorLUIorAUIPC <= FD_instr[2];
end
/******************************************************************************/
/******************************************************************************/
reg DE_nop; // Needed by instret in W stage
reg [4:0] DE_rdId;
reg [4:0] DE_rs1Id;
reg [4:0] DE_rs2Id;
reg [1:0] DE_csrId;
reg [2:0] DE_funct3;
(* onehot *) reg [7:0] DE_funct3_is;
reg [5:5] DE_funct7;
reg [31:0] DE_IorSimm;
reg DE_isALUreg;
reg DE_isALUimm;
reg DE_isBranch;
reg DE_isJALR;
reg DE_isJAL;
reg DE_isAUIPC;
reg DE_isLUI;
reg DE_isLoad;
reg DE_isStore;
reg DE_isCSRRS;
reg DE_isEBREAK;
reg DE_wbEnable; // !isBranch && !isStore && rdId != 0
reg DE_isJALorJALRorLUIorAUIPC;
reg [31:0] DE_PCplus4orUimm;
reg [31:0] DE_PCplusBorJimm;
/******************************************************************************/
/******************************************************************************/
/*** E: Execute ***/
/*********** Registrer forwarding ************************************/
wire E_M_fwd_rs1 = EM_wbEnable && (EM_rdId == DE_rs1Id);
wire E_W_fwd_rs1 = MW_wbEnable && (MW_rdId == DE_rs1Id);
wire E_M_fwd_rs2 = EM_wbEnable && (EM_rdId == DE_rs2Id);
wire E_W_fwd_rs2 = MW_wbEnable && (MW_rdId == DE_rs2Id);
wire [31:0] E_rs1 = E_M_fwd_rs1 ? EM_Eresult :
E_W_fwd_rs1 ? wbData :
RegisterBank[DE_rs1Id] ;
wire [31:0] E_rs2 = E_M_fwd_rs2 ? EM_Eresult :
E_W_fwd_rs2 ? wbData :
RegisterBank[DE_rs2Id] ;
/*********** the ALU *************************************************/
wire [31:0] E_aluIn1 = E_rs1;
wire [31:0] E_aluIn2 = (DE_isALUreg | DE_isBranch) ? E_rs2 : DE_IorSimm;
wire [4:0] E_shamt = DE_isALUreg ? E_rs2[4:0] : DE_rs2Id;
wire E_minus = DE_funct7[5] & DE_isALUreg;
wire E_arith_shift = DE_funct7[5];
// The adder is used by both arithmetic instructions and JALR.
wire [31:0] E_aluPlus = E_aluIn1 + E_aluIn2;
// Use a single 33 bits subtract to do subtraction and all comparisons
// (trick borrowed from swapforth/J1)
wire [32:0] E_aluMinus = {1'b1, ~E_aluIn2} + {1'b0,E_aluIn1} + 33'b1;
wire E_LT =
(E_aluIn1[31] ^ E_aluIn2[31]) ? E_aluIn1[31] : E_aluMinus[32];
wire E_LTU = E_aluMinus[32];
wire E_EQ = (E_aluMinus[31:0] == 0);
// Bit-reverse a 32-bit word: result bit i is input bit (31 - i).
// Used around the shifter so that a single right shifter handles both
// left and right shifts (saves silicon!).
function [31:0] flip32;
input [31:0] x;
integer i;
begin
   for(i = 0; i < 32; i = i + 1) begin
      flip32[i] = x[31 - i];
   end
end
endfunction
// Left shifts (funct3 == 001) are done by bit-reversing the operand,
// shifting right, then bit-reversing the result (E_leftshift).
wire [31:0] E_shifter_in = (DE_funct3==3'b001) ? flip32(E_aluIn1) : E_aluIn1;
/* verilator lint_off WIDTH */
// 33-bit arithmetic right shift. The extra top bit is the sign of rs1 for
// arithmetic shifts and 0 otherwise, so the same operator performs both
// logical and arithmetic shifts. The 33-bit result is truncated to 32 bits,
// hence the lint waiver.
wire [31:0] E_shifter =
$signed({E_arith_shift & E_aluIn1[31], E_shifter_in}) >>> E_aluIn2[4:0];
/* verilator lint_on WIDTH */
wire [31:0] E_leftshift = flip32(E_shifter);
// AND-OR multiplexer driven by the one-hot funct3:
//   [0] add / subtract      [4] XOR
//   [1] shift left          [5] shift right (logical or arithmetic)
//   [2] signed less-than    [6] OR
//   [3] unsigned less-than  [7] AND
wire [31:0] E_aluOut =
(DE_funct3_is[0] ? (E_minus ? E_aluMinus[31:0] : E_aluPlus) : 32'b0) |
(DE_funct3_is[1] ? E_leftshift : 32'b0) |
(DE_funct3_is[2] ? {31'b0, E_LT } : 32'b0) |
(DE_funct3_is[3] ? {31'b0, E_LTU} : 32'b0) |
(DE_funct3_is[4] ? E_aluIn1 ^ E_aluIn2 : 32'b0) |
(DE_funct3_is[5] ? E_shifter : 32'b0) |
(DE_funct3_is[6] ? E_aluIn1 | E_aluIn2 : 32'b0) |
(DE_funct3_is[7] ? E_aluIn1 & E_aluIn2 : 32'b0) ;
/*********** Branch, JAL, JALR ***********************************/
// Branch predicate: one condition per one-hot funct3 bit, then OR-reduce.
//   bit 7: BGEU   bit 6: BLTU   bit 5: BGE   bit 4: BLT
//   bit 3, 2: unused            bit 1: BNE   bit 0: BEQ
wire E_takeBranch = |( DE_funct3_is &
                       { !E_LTU, E_LTU, !E_LT, E_LT, 2'b00, !E_EQ, E_EQ } );
// The PC is redirected by any jump, and by a branch whose condition holds.
wire E_JumpOrBranch = DE_isJAL | DE_isJALR | (DE_isBranch & E_takeBranch);
// JALR target is rs1 + imm with bit 0 cleared; JAL and branches use the
// target carried in DE_PCplusBorJimm.
wire [31:0] E_JumpOrBranchAddr =
   DE_isJALR ? (E_aluPlus & 32'hFFFF_FFFE) : DE_PCplusBorJimm;
// Value produced by the E stage: DE_PCplus4orUimm for JAL, JALR, LUI and
// AUIPC, the ALU output for everything else.
wire [31:0] E_result =
   DE_isJALorJALRorLUIorAUIPC ? DE_PCplus4orUimm : E_aluOut;
// Load/store address (dedicated adder, independent from the ALU one).
wire [31:0] E_addr = DE_IorSimm + E_rs1;
/**************************************************************/
// E -> M pipeline register update: on every rising clock edge the results
// of the Execute stage are captured for the Memory stage.
always @(posedge clk) begin
EM_nop <= DE_nop;
EM_rdId <= DE_rdId;
EM_rs1Id <= DE_rs1Id;
EM_rs2Id <= DE_rs2Id;
EM_funct3 <= DE_funct3;
// Convert the 2-bit CSR selector to one-hot form.
EM_csrId_is <= 4'b0001 << DE_csrId;
EM_rs2 <= E_rs2;
EM_Eresult <= E_result;
EM_addr <= E_addr;
// The data RAM word is read here, in E, with the freshly computed
// address, so that it is available in EM_Mdata during the M stage.
EM_Mdata <= DATARAM[E_addr[15:2]];
EM_isLoad <= DE_isLoad;
EM_isStore <= DE_isStore;
EM_isCSRRS <= DE_isCSRRS;
// Write-back is disabled when the destination is x0 (this is also what
// keeps x0 out of the forwarding paths).
EM_wbEnable <= DE_wbEnable && (DE_rdId != 0);
EM_JumpOrBranchNow <= E_JumpOrBranch;
EM_JumpOrBranchAddr <= E_JumpOrBranchAddr;
end
// halt is asserted while an EBREAK sits in the E stage and reset is released.
assign halt = resetn & DE_isEBREAK;
/******************************************************************************/
/******************************************************************************/
// E/M pipeline registers: written by the E stage on each clock edge and
// read by the M (Memory) stage.
reg EM_nop; // Needed by instret in W stage
reg [4:0] EM_rdId;
reg [4:0] EM_rs1Id;
reg [4:0] EM_rs2Id;
// One-hot CSR selector (bit k set when the 2-bit CSR id equals k).
(* onehot *) reg [3:0] EM_csrId_is;
reg [2:0] EM_funct3;
// Value of rs2 (after forwarding): the data to be stored.
reg [31:0] EM_rs2;
// Result of the E stage; also the source of the M -> E forwarding path.
reg [31:0] EM_Eresult;
// Load/store byte address.
reg [31:0] EM_addr;
// Data RAM word read during the E stage.
reg [31:0] EM_Mdata;
reg EM_isStore;
reg EM_isLoad;
reg EM_isCSRRS;
// Already cleared when the destination register is x0.
reg EM_wbEnable;
reg EM_JumpOrBranchNow;
reg [31:0] EM_JumpOrBranchAddr;
/******************************************************************************/
/******************************************************************************/
/*** M: Memory ***/
// Memory stage. Working on the E/M pipeline registers (EM_*), it:
//   - aligns the store data and builds the byte write mask,
//   - routes stores to the data RAM or to the IO bus (address bit 22),
//   - extracts and sign-extends the loaded byte / halfword / word,
//   - selects the counter value returned by CSRRS.
// The RAM word used by a load is read one stage earlier (EM_Mdata is
// registered from DATARAM in the E stage); see the store-to-load bypass
// below for the consequence.

// Access width from funct3[1:0]: 00 = byte, 01 = halfword, else word.
wire M_isB = (EM_funct3[1:0] == 2'b00);
wire M_isH = (EM_funct3[1:0] == 2'b01);

/*************** STORE **************************/
// The low byte / halfword of rs2 is replicated onto the byte lanes
// selected by the two low address bits; the write mask then picks the
// lanes that are actually written.
wire [31:0] M_STORE_data;
assign M_STORE_data[ 7: 0] = EM_rs2[7:0];
assign M_STORE_data[15: 8] = EM_addr[0] ? EM_rs2[7:0] : EM_rs2[15: 8] ;
assign M_STORE_data[23:16] = EM_addr[1] ? EM_rs2[7:0] : EM_rs2[23:16] ;
assign M_STORE_data[31:24] = EM_addr[0] ? EM_rs2[7:0] :
                             EM_addr[1] ? EM_rs2[15:8] : EM_rs2[31:24] ;

// The memory write mask:
//    1111                     if writing a word
//    0011 or 1100             if writing a halfword
//                                (depending on EM_addr[1])
//    0001, 0010, 0100 or 1000 if writing a byte
//                                (depending on EM_addr[1:0])
wire [3:0] M_STORE_wmask = M_isB ?
                              (EM_addr[1] ?
                                 (EM_addr[0] ? 4'b1000 : 4'b0100) :
                                 (EM_addr[0] ? 4'b0010 : 4'b0001)
                              ) :
                           M_isH ? (EM_addr[1] ? 4'b1100 : 4'b0011) :
                                   4'b1111 ;

// Address bit 22 selects the IO page; everything else goes to the data RAM.
wire M_isIO  = EM_addr[22];
wire M_isRAM = !M_isIO;

// IO bus: the raw (unaligned) rs2 value is presented as write data.
assign IO_mem_addr  = EM_addr;
assign IO_mem_wr    = EM_isStore && M_isIO; // && M_STORE_wmask[0];
assign IO_mem_wdata = EM_rs2;

// RAM byte-lane write enables: all zero unless a store targets the RAM.
wire [3:0] M_wmask = {4{EM_isStore & M_isRAM}} & M_STORE_wmask;

reg [31:0] DATARAM [0:16383]; // 16384 4-bytes words
                              // 64 KB of data RAM in total
wire [13:0] M_word_addr = EM_addr[15:2];

always @(posedge clk) begin
   if(M_wmask[0]) DATARAM[M_word_addr][ 7:0 ] <= M_STORE_data[ 7:0 ];
   if(M_wmask[1]) DATARAM[M_word_addr][15:8 ] <= M_STORE_data[15:8 ];
   if(M_wmask[2]) DATARAM[M_word_addr][23:16] <= M_STORE_data[23:16];
   if(M_wmask[3]) DATARAM[M_word_addr][31:24] <= M_STORE_data[31:24];
end

/*************** STORE-TO-LOAD BYPASS ************/
// A load reads DATARAM while it is in E, on the very clock edge at which
// a store sitting in M writes it. Both use nonblocking assignments, so
// when the two target the same word the load captures the word as it
// was *before* the store. To give the load the up-to-date value, the
// store's byte lanes are recorded on that same edge and merged over
// EM_Mdata in the next cycle, when the load is in M. The merge is done
// after the RAM output register, so the registered read of DATARAM
// itself is left untouched.
// Only one cycle of overlap exists: older stores have already been
// written to DATARAM when the load reads it.
reg [3:0]  M_stfwd_mask; // byte lanes of EM_Mdata to override
reg [31:0] M_stfwd_data; // aligned data of the overlapping store
always @(posedge clk) begin
   M_stfwd_mask <= (DE_isLoad && (E_addr[15:2] == M_word_addr)) ?
                   M_wmask : 4'b0000;
   M_stfwd_data <= M_STORE_data;
end

// RAM word as seen by the load, with the bypassed byte lanes merged in.
wire [31:0] M_RAM_word = {
   M_stfwd_mask[3] ? M_stfwd_data[31:24] : EM_Mdata[31:24],
   M_stfwd_mask[2] ? M_stfwd_data[23:16] : EM_Mdata[23:16],
   M_stfwd_mask[1] ? M_stfwd_data[15: 8] : EM_Mdata[15: 8],
   M_stfwd_mask[0] ? M_stfwd_data[ 7: 0] : EM_Mdata[ 7: 0]
};

// funct3[2] cleared: sign-extending load; set: zero-extending load.
wire M_sext = !EM_funct3[2];

/*************** LOAD ****************************/
// Select the addressed halfword, then the addressed byte within it.
wire [15:0] M_LOAD_H = EM_addr[1] ? M_RAM_word[31:16] : M_RAM_word[15:0];
wire  [7:0] M_LOAD_B = EM_addr[0] ? M_LOAD_H[15:8]    : M_LOAD_H[7:0];
// Bit replicated into the upper bits (0 for zero-extending loads).
wire M_LOAD_sign = M_sext & (M_isB ? M_LOAD_B[7] : M_LOAD_H[15]);

wire [31:0] M_Mdata = M_isB ? {{24{M_LOAD_sign}},M_LOAD_B} :
                      M_isH ? {{16{M_LOAD_sign}},M_LOAD_H} :
                              M_RAM_word ;

// CSR read value, selected by the one-hot CSR id:
//   [0] cycle, low word     [2] cycle, high word
//   [1] instret, low word   [3] instret, high word
wire [31:0] M_CSR_data =
   (EM_csrId_is[0] ? cycle[31:0]    : 32'b0) |
   (EM_csrId_is[2] ? cycle[63:32]   : 32'b0) |
   (EM_csrId_is[1] ? instret[31:0]  : 32'b0) |
   (EM_csrId_is[3] ? instret[63:32] : 32'b0) ;

// Initial content of the data RAM.
initial begin
   $readmemh("DATARAM.hex",DATARAM);
end
// M -> W pipeline register update, plus the retired-instruction counter.
always @(posedge clk) begin
MW_nop <= EM_nop;
MW_rdId <= EM_rdId;
// Value to write back: loaded data (from the IO bus or from the data RAM),
// CSR value, or the result computed in the E stage.
MW_wbData <=
EM_isLoad ? (M_isIO ? IO_mem_rdata : M_Mdata) :
EM_isCSRRS ? M_CSR_data :
EM_Eresult;
MW_wbEnable <= EM_wbEnable;
if(!resetn) begin
instret <= 0;
end else if(!MW_nop) begin
// It's easier to count the retired instructions when
// they *exit* the pipeline (but it requires to pass
// a _nop flag through the pipeline).
instret <= instret + 1;
end
end
/******************************************************************************/
/******************************************************************************/
// M/W pipeline registers: written by the M stage on each clock edge and
// read by the W (WriteBack) stage.
reg MW_nop; // Needed by instret in W stage
reg [4:0] MW_rdId; // destination register index
reg [31:0] MW_wbData; // value to write back (also forwarded to E as wbData)
reg MW_wbEnable; // write-back enable
/******************************************************************************/
/******************************************************************************/
/*** W: WriteBack ***/
// The write-back port is driven directly by the M/W pipeline registers.
// wbData is also the source of the W -> E forwarding path.
assign wbData = MW_wbData;
assign wbEnable = MW_wbEnable;
assign wbRdId = MW_rdId;
/******************************************************************************/
// Data hazard detection: the instruction in D reads a register that is the
// destination of the instruction currently in E.
// rdId == 0 is not tested here: in general one loads data into a real
// register, not into x0, so the test would almost never change the result.
wire rs1Hazard = D_readsRs1 && (D_rs1Id == DE_rdId);
wire rs2Hazard = D_readsRs2 && (D_rs2Id == DE_rdId);
// Alternative: a simpler test that generates slightly more bubbles
// (to be used if the critical path is here).
// -> the author's note on the alternative: it seems to have no influence
//    on CPI and results in slightly better timings.
// wire rs1Hazard = (D_rs1Id == DE_rdId);
// wire rs2Hazard = (D_rs2Id == DE_rdId);
// Another alternative: there is no obligation to compare all the bits!
// wire rs1Hazard = (D_rs1Id[3:0] == DE_rdId[3:0]);
// wire rs2Hazard = (D_rs2Id[3:0] == DE_rdId[3:0]);
// Add a bubble only if the next instruction uses the result of a latency-2
// instruction (load or CSRRS: their value is only selected in the M stage,
// so it cannot be forwarded from EM_Eresult).
wire dataHazard = !FD_nop && (DE_isLoad || DE_isCSRRS) &&
(rs1Hazard || rs2Hazard);
// (other option: always add bubble after latency-2 instr
// like Samsoniuk's DarkRiscV). Reduces critical path.
// wire dataHazard = !FD_nop && (DE_isLoad || DE_isCSRRS);
// F and D are frozen on a data hazard and while halted.
assign F_stall = dataHazard | halt;
assign D_stall = dataHazard | halt;
// Here we need to use E_JumpOrBranch (the registered version
// DE_JumpOrBranch is not ready on time).
// D is flushed when E redirects the PC; E is flushed on a PC redirect and
// also on a data hazard (this is what inserts the bubble).
assign D_flush = E_JumpOrBranch;
assign E_flush = E_JumpOrBranch | dataHazard;
/******************************************************************************/
// Simulation only (BENCH defined): stop the simulation as soon as halt is
// asserted, i.e. when an EBREAK reaches the E stage.
`ifdef BENCH
always @(posedge clk) begin
if(halt) $finish();
end
`endif
/******************************************************************************/
endmodule
// Top-level system: the processor, a clock/reset generator, five LEDs and a
// transmit-only UART, the last two being memory-mapped on the IO bus.
module SOC (
   input            CLK,   // system clock
   input            RESET, // reset button
   output reg [4:0] LEDS,  // system LEDs
   input            RXD,   // UART receive
   output           TXD    // UART transmit
);

   /***************** Clock and reset ********************************/

   wire clk;     // internal clock
   wire resetn;  // internal reset, active low

   // Gearbox and reset circuitry.
   Clockworks CW(
      .CLK(CLK),
      .RESET(RESET),
      .clk(clk),
      .resetn(resetn)
   );

   /***************** Processor and its IO bus ***********************/

   wire [31:0] IO_mem_addr;   // byte address of the IO access
   wire [31:0] IO_mem_rdata;  // data returned to the processor
   wire [31:0] IO_mem_wdata;  // data written by the processor
   wire        IO_mem_wr;     // write strobe

   Processor CPU(
      .clk(clk),
      .resetn(resetn),
      .IO_mem_addr(IO_mem_addr),
      .IO_mem_rdata(IO_mem_rdata),
      .IO_mem_wdata(IO_mem_wdata),
      .IO_mem_wr(IO_mem_wr)
   );

   /***************** IO address decoding ****************************/

   // Memory-mapped IO in IO page, 1-hot addressing in word address:
   // each device is selected by one bit of the word address.
   localparam IO_LEDS_bit      = 0;  // W five leds
   localparam IO_UART_DAT_bit  = 1;  // W data to send (8 bits)
   localparam IO_UART_CNTL_bit = 2;  // R status. bit 9: busy sending

   wire [13:0] IO_wordaddr = IO_mem_addr[15:2];

   wire leds_wr     = IO_mem_wr & IO_wordaddr[IO_LEDS_bit];
   wire uart_valid  = IO_mem_wr & IO_wordaddr[IO_UART_DAT_bit];
   wire uart_status = IO_wordaddr[IO_UART_CNTL_bit];

   /***************** LEDs *******************************************/

   always @(posedge clk) begin
      if(leds_wr) LEDS <= IO_mem_wdata[4:0];
   end

   /***************** UART (transmit only) ***************************/

   wire uart_ready;

   corescore_emitter_uart #(
      .clk_freq_hz(`CPU_FREQ*1000000),
      .baud_rate(1000000)
   ) UART(
      .i_clk(clk),
      .i_rst(!resetn),
      .i_data(IO_mem_wdata[7:0]),
      .i_valid(uart_valid),
      .o_ready(uart_ready),
      .o_uart_tx(TXD)
   );

   // Status word: bit 9 is set while the UART is not ready.
   wire        uart_busy        = !uart_ready;
   wire [31:0] uart_status_word = {22'b0, uart_busy, 9'b0};

   assign IO_mem_rdata = uart_status ? uart_status_word : 32'b0;

   /***************** Simulation console *****************************/

`ifdef BENCH
   // Echo every character sent to the UART on the simulator's output.
   always @(posedge clk) begin
      if(uart_valid) begin
         $write("%c", IO_mem_wdata[7:0] );
         $fflush(32'h8000_0001);
      end
   end
`endif

endmodule