Branch prediction

- added DHRYSTONE test
   - cleaning sources, adding hit/miss reports in simulation
   - pipeline9.v: with return address stack
This commit is contained in:
Bruno Levy
2022-09-03 10:50:26 +02:00
parent 629b48b24e
commit 35368da0ef
9 changed files with 1928 additions and 4 deletions

View File

@@ -0,0 +1,425 @@
/*
****************************************************************************
*
* "DHRYSTONE" Benchmark Program
* -----------------------------
*
* Version: C, Version 2.1
*
* File: dhry.h (part 1 of 3)
*
* Date: May 25, 1988
*
* Author: Reinhold P. Weicker
* Siemens AG, AUT E 51
* Postfach 3220
* 8520 Erlangen
* Germany (West)
* Phone: [+49]-9131-7-20330
* (8-17 Central European Time)
* Usenet: ..!mcsun!unido!estevax!weicker
*
* Original Version (in Ada) published in
* "Communications of the ACM" vol. 27., no. 10 (Oct. 1984),
* pp. 1013 - 1030, together with the statistics
* on which the distribution of statements etc. is based.
*
* In this C version, the following C library functions are used:
* - strcpy, strcmp (inside the measurement loop)
* - printf, scanf (outside the measurement loop)
* In addition, Berkeley UNIX system calls "times ()" or "time ()"
* are used for execution time measurement. For measurements
* on other systems, these calls have to be changed.
*
* Collection of Results:
* Reinhold Weicker (address see above) and
*
* Rick Richardson
* PC Research, Inc.
* 94 Apple Orchard Drive
* Tinton Falls, NJ 07724
* Phone: (201) 389-8963 (9-17 EST)
* Usenet: ...!uunet!pcrat!rick
*
* Please send results to Rick Richardson and/or Reinhold Weicker.
* Complete information should be given on hardware and software used.
* Hardware information includes: Machine type, CPU, type and size
* of caches; for microprocessors: clock frequency, memory speed
* (number of wait states).
* Software information includes: Compiler (and runtime library)
* manufacturer and version, compilation switches, OS version.
* The Operating System version may give an indication about the
* compiler; Dhrystone itself performs no OS calls in the measurement loop.
*
* The complete output generated by the program should be mailed
* such that at least some checks for correctness can be made.
*
***************************************************************************
*
* History: This version C/2.1 has been made for two reasons:
*
* 1) There is an obvious need for a common C version of
* Dhrystone, since C is at present the most popular system
* programming language for the class of processors
* (microcomputers, minicomputers) where Dhrystone is used most.
* There should be, as far as possible, only one C version of
* Dhrystone such that results can be compared without
* restrictions. In the past, the C versions distributed
* by Rick Richardson (Version 1.1) and by Reinhold Weicker
* had small (though not significant) differences.
*
* 2) As far as it is possible without changes to the Dhrystone
* statistics, optimizing compilers should be prevented from
* removing significant statements.
*
* This C version has been developed in cooperation with
* Rick Richardson (Tinton Falls, NJ), it incorporates many
* ideas from the "Version 1.1" distributed previously by
* him over the UNIX network Usenet.
* I also thank Chaim Benedelac (National Semiconductor),
* David Ditzel (SUN), Earl Killian and John Mashey (MIPS),
* Alan Smith and Rafael Saavedra-Barrera (UC at Berkeley)
* for their help with comments on earlier versions of the
* benchmark.
*
* Changes: In the initialization part, this version follows mostly
* Rick Richardson's version distributed via Usenet, not the
* version distributed earlier via floppy disk by Reinhold Weicker.
* As a concession to older compilers, names have been made
* unique within the first 8 characters.
* Inside the measurement loop, this version follows the
* version previously distributed by Reinhold Weicker.
*
* At several places in the benchmark, code has been added,
* but within the measurement loop only in branches that
* are not executed. The intention is that optimizing compilers
* should be prevented from moving code out of the measurement
* loop, or from removing code altogether. Since the statements
* that are executed within the measurement loop have NOT been
* changed, the numbers defining the "Dhrystone distribution"
* (distribution of statements, operand types and locality)
* still hold. Except for sophisticated optimizing compilers,
* execution times for this version should be the same as
* for previous versions.
*
* Since it has proven difficult to subtract the time for the
* measurement loop overhead in a correct way, the loop check
* has been made a part of the benchmark. This does have
* an impact - though a very minor one - on the distribution
* statistics which have been updated for this version.
*
* All changes within the measurement loop are described
* and discussed in the companion paper "Rationale for
* Dhrystone version 2".
*
* Because of the self-imposed limitation that the order and
* distribution of the executed statements should not be
* changed, there are still cases where optimizing compilers
* may not generate code for some statements. To a certain
* degree, this is unavoidable for small synthetic benchmarks.
* Users of the benchmark are advised to check code listings
* whether code is generated for all statements of Dhrystone.
*
* Version 2.1 is identical to version 2.0 distributed via
* the UNIX network Usenet in March 1988 except that it corrects
* some minor deficiencies that were found by users of version 2.0.
* The only change within the measurement loop is that a
* non-executed "else" part was added to the "if" statement in
* Func_3, and a non-executed "else" part removed from Proc_3.
*
***************************************************************************
*
* Defines: The following "Defines" are possible:
* -DREG=register (default: Not defined)
* As an approximation to what an average C programmer
* might do, the "register" storage class is applied
* (if enabled by -DREG=register)
* - for local variables, if they are used (dynamically)
* five or more times
* - for parameters if they are used (dynamically)
* six or more times
* Note that an optimal "register" strategy is
* compiler-dependent, and that "register" declarations
* do not necessarily lead to faster execution.
* -DNOSTRUCTASSIGN (default: Not defined)
* Define if the C compiler does not support
* assignment of structures.
* -DNOENUMS (default: Not defined)
* Define if the C compiler does not support
* enumeration types.
* -DTIMES (default)
* -DTIME
* The "times" function of UNIX (returning process times)
* or the "time" function (returning wallclock time)
* is used for measurement.
* For single user machines, "time ()" is adequate. For
* multi-user machines where you cannot get single-user
* access, use the "times ()" function. If you have
* neither, use a stopwatch in the dead of night.
* "printf"s are provided marking the points "Start Timer"
* and "Stop Timer". DO NOT use the UNIX "time(1)"
* command, as this will measure the total time to
* run this program, which will (erroneously) include
* the time to allocate storage (malloc) and to perform
* the initialization.
* -DHZ=nnn
* In Berkeley UNIX, the function "times" returns process
* time in 1/HZ seconds, with HZ = 60 for most systems.
* CHECK YOUR SYSTEM DESCRIPTION BEFORE YOU JUST APPLY
* A VALUE.
*
***************************************************************************
*
* Compilation model and measurement (IMPORTANT):
*
* This C version of Dhrystone consists of three files:
* - dhry.h (this file, containing global definitions and comments)
* - dhry_1.c (containing the code corresponding to Ada package Pack_1)
* - dhry_2.c (containing the code corresponding to Ada package Pack_2)
*
* The following "ground rules" apply for measurements:
* - Separate compilation
* - No procedure merging
* - Otherwise, compiler optimizations are allowed but should be indicated
* - Default results are those without register declarations
* See the companion paper "Rationale for Dhrystone Version 2" for a more
* detailed discussion of these ground rules.
*
* For 16-Bit processors (e.g. 80186, 80286), times for all compilation
* models ("small", "medium", "large" etc.) should be given if possible,
* together with a definition of these models for the compiler system used.
*
**************************************************************************
*
* Dhrystone (C version) statistics:
*
* [Comment from the first distribution, updated for version 2.
* Note that because of language differences, the numbers are slightly
* different from the Ada version.]
*
* The following program contains statements of a high level programming
* language (here: C) in a distribution considered representative:
*
* assignments 52 (51.0 %)
* control statements 33 (32.4 %)
* procedure, function calls 17 (16.7 %)
*
* 103 statements are dynamically executed. The program is balanced with
* respect to the three aspects:
*
* - statement type
* - operand type
* - operand locality
* operand global, local, parameter, or constant.
*
* The combination of these three aspects is balanced only approximately.
*
* 1. Statement Type:
* ----------------- number
*
* V1 = V2 9
* (incl. V1 = F(..))
* V = Constant 12
* Assignment, 7
* with array element
* Assignment, 6
* with record component
* --
* 34 34
*
* X = Y +|-|"&&"|"|" Z 5
* X = Y +|-|"==" Constant 6
* X = X +|- 1 3
* X = Y *|/ Z 2
* X = Expression, 1
* two operators
* X = Expression, 1
* three operators
* --
* 18 18
*
* if .... 14
* with "else" 7
* without "else" 7
* executed 3
* not executed 4
* for ... 7 | counted every time
* while ... 4 | the loop condition
* do ... while 1 | is evaluated
* switch ... 1
* break 1
* declaration with 1
* initialization
* --
* 34 34
*
* P (...) procedure call 11
* user procedure 10
* library procedure 1
* X = F (...)
* function call 6
* user function 5
* library function 1
* --
* 17 17
* ---
* 103
*
* The average number of parameters in procedure or function calls
* is 1.82 (not counting the function values as implicit parameters).
*
*
* 2. Operators
* ------------
* number approximate
* percentage
*
* Arithmetic 32 50.8
*
* + 21 33.3
* - 7 11.1
* * 3 4.8
* / (int div) 1 1.6
*
* Comparison 27 42.8
*
* == 9 14.3
* /= 4 6.3
* > 1 1.6
* < 3 4.8
* >= 1 1.6
* <= 9 14.3
*
* Logic 4 6.3
*
* && (AND-THEN) 1 1.6
* | (OR) 1 1.6
* ! (NOT) 2 3.2
*
* -- -----
* 63 100.1
*
*
* 3. Operand Type (counted once per operand reference):
* ---------------
* number approximate
* percentage
*
* Integer 175 72.3 %
* Character 45 18.6 %
* Pointer 12 5.0 %
* String30 6 2.5 %
* Array 2 0.8 %
* Record 2 0.8 %
* --- -------
* 242 100.0 %
*
* When there is an access path leading to the final operand (e.g. a record
* component), only the final data type on the access path is counted.
*
*
* 4. Operand Locality:
* -------------------
* number approximate
* percentage
*
* local variable 114 47.1 %
* global variable 22 9.1 %
* parameter 45 18.6 %
* value 23 9.5 %
* reference 22 9.1 %
* function result 6 2.5 %
* constant 55 22.7 %
* --- -------
* 242 100.0 %
*
*
* The program does not compute anything meaningful, but it is syntactically
* and semantically correct. All variables have a value assigned to them
* before they are used as a source operand.
*
* There has been no explicit effort to account for the effects of a
* cache, or to balance the use of long or short displacements for code or
* data.
*
***************************************************************************
*/
#pragma once
/* Compiler and system dependent definitions: */
/* Timing back-end selection: unless TIME (the "time ()" function) was */
/* requested explicitly, fall back to the times(2) back-end (TIMES).   */
#if !defined(TIME)
#define TIMES
#endif
/* The "times" back-end needs the declarations from <sys/times.h>. */
#if defined(TIMES)
#include <sys/types.h>
#include <sys/times.h>
#endif
/* Scale factor applied to User_Time when dhry_1.c computes */
/* "Microseconds for one run" (non-RISCV reporting branch only). */
/* NOTE(review): the name suggests microseconds per second (1000000.0), */
/* but the value is 80000000.0 - presumably adapted to a specific clock */
/* frequency; TODO confirm before relying on the microsecond figures. */
#define Mic_secs_Per_Second 80000000.0
/* Berkeley UNIX C returns process times in seconds/HZ */
/* structassign(d, s): copy structure s into structure d. */
/* With -DNOSTRUCTASSIGN (compilers without structure assignment) the */
/* copy goes through memcpy; otherwise it is a plain assignment. */
/* Both the arguments and the whole expansion are parenthesized so that */
/* the macro behaves as one expression wherever it is used (the bare */
/* "d = s" form bound wrongly inside larger expressions). */
#ifdef NOSTRUCTASSIGN
#define structassign(d, s) memcpy(&(d), &(s), sizeof(d))
#else
#define structassign(d, s) ((d) = (s))
#endif
/* Enumeration: the five-valued enumeration type used by the benchmark. */
/* For compilers without enumeration types it degrades to int plus five */
/* integer constants. The header comment documents the switch as */
/* -DNOENUMS while the code historically tested NOENUM; both spellings */
/* are accepted so that the documented option actually takes effect. */
#if defined(NOENUM) || defined(NOENUMS)
#define Ident_1 0
#define Ident_2 1
#define Ident_3 2
#define Ident_4 3
#define Ident_5 4
typedef int Enumeration;
#else
typedef enum {Ident_1, Ident_2, Ident_3, Ident_4, Ident_5}
Enumeration;
#endif
/* for boolean and enumeration types in Ada, Pascal */
/* General definitions: */
//#include <stdio.h>
/* for strcpy, strcmp */
/* Value of a Null pointer, and the two truth values. */
#define Null 0
#define false 0
#define true 1
/* Scalar types: all integer-like types are plain int, letters are char. */
typedef int Boolean;
typedef int One_Thirty;
typedef int One_Fifty;
typedef char Capital_Letter;
/* Aggregate types: a 30-character string plus terminator, a vector of */
/* 50 ints, and a 50 x 50 matrix built from that vector type. */
typedef char Str_30 [31];
typedef int Arr_1_Dim [50];
typedef Arr_1_Dim Arr_2_Dim [50];
/* Rec_Type: the linked record the benchmark operates on; Rec_Pointer is */
/* a pointer to it. Every record carries a link to another record, a */
/* discriminant and a variant part with three alternative layouts. */
typedef struct record
{
  struct record *Ptr_Comp;          /* link to another record */
  Enumeration    Discr;             /* discriminant */
  union
  {
    struct
    {
      Enumeration Enum_Comp;
      int         Int_Comp;
      char        Str_Comp [31];
    } var_1;                        /* enum + int + 30-char string */
    struct
    {
      Enumeration E_Comp_2;
      char        Str_2_Comp [31];
    } var_2;                        /* enum + 30-char string */
    struct
    {
      char        Ch_1_Comp;
      char        Ch_2_Comp;
    } var_3;                        /* two characters */
  } variant;
} Rec_Type, *Rec_Pointer;

View File

@@ -0,0 +1,465 @@
/*
****************************************************************************
*
* "DHRYSTONE" Benchmark Program
* -----------------------------
*
* Version: C, Version 2.1
*
* File: dhry_1.c (part 2 of 3)
*
* Date: May 25, 1988
*
* Author: Reinhold P. Weicker
*
****************************************************************************
*/
#include "dhry.h"
#ifdef USE_MYSTDLIB
extern char *malloc ();
#else
# include <stdlib.h>
# include <string.h>
#endif
/* Global Variables: */
Rec_Pointer     Ptr_Glob;
Rec_Pointer     Next_Ptr_Glob;
int             Int_Glob;
Boolean         Bool_Glob;
char            Ch_1_Glob;
char            Ch_2_Glob;
int             Arr_1_Glob [50];
int             Arr_2_Glob [50] [50];

/* forward declaration necessary since Enumeration may not simply be int */
Enumeration     Func_1 ();

/* Reg records whether the file was compiled with REG supplied on the */
/* command line; when it was not, REG becomes defined as empty, i.e. no */
/* register variables. */
#ifdef REG
Boolean         Reg = true;
#else
Boolean         Reg = false;
#define REG
#endif

/* variables for time measurement: */

/* "times ()" based measurement. */
/* NOTE(review): dhry.h defines TIMES, but this section is keyed on */
/* IGN_TIMES - presumably to keep the "times" path disabled; confirm. */
#ifdef IGN_TIMES
#define HZ 50000000
struct tms      time_info;
extern time_t   times ();
                /* see library function "times" */
#define Too_Small_Time 120
                /* Measurements should last at least about 2 seconds */
#endif

/* "time ()" based measurement; with RISCV an instruction counter */
/* (insn) is declared as well. */
#ifdef TIME
extern long     time();
#ifdef RISCV
extern long     insn();
#endif
                /* see library function "time" */
#define Too_Small_Time 2
                /* Measurements should last at least 2 seconds */
#endif

long            Begin_Time;
long            End_Time;
long            User_Time;
#ifdef RISCV
long            Begin_Insn;
long            End_Insn;
long            User_Insn;
#endif
float           Microseconds;
float           Dhrystones_Per_Second;
/* end of variables for time measurement */
main ()
/*****/
/* main program, corresponds to procedures */
/* Main and Proc_0 in the Ada version */
{
One_Fifty Int_1_Loc;
REG One_Fifty Int_2_Loc;
One_Fifty Int_3_Loc;
REG char Ch_Index;
Enumeration Enum_Loc;
Str_30 Str_1_Loc;
Str_30 Str_2_Loc;
REG int Run_Index;
REG int Number_Of_Runs;
Rec_Type R1,R2;
/* Initializations */
/*
* FEMTOSOC/FEMTORV32 modifications ===========================
*/
/*
* Since there are only two calls to malloc(), and that malloc()
* is not supported yet by femtosoc lib, I replaced them with
* pre-allocated structures.
*/
Next_Ptr_Glob = &R1; // (Rec_Pointer) malloc (sizeof (Rec_Type));
Ptr_Glob = &R2; // (Rec_Pointer) malloc (sizeof (Rec_Type));
/*
* Initialize IO (redirect to UART or OLED screen depending on
* femtosoc.v configuration).
*/
// femtosoc_tty_init();
/*
* Verify that this core was synthetized with counters.
* The generation script extracts configuration
* from femtosoc.v and writes values at specific memory addresses.
* See stubs.c and LIB/femtorv32.h
*/
if(!has_counters()) {
printf("This femtorv32 core does not have counters (see femtosoc.v)");
return -1;
}
/*
* End of FEMTOSOC/FEMTORV32 modifications ======================
*/
Ptr_Glob->Ptr_Comp = Next_Ptr_Glob;
Ptr_Glob->Discr = Ident_1;
Ptr_Glob->variant.var_1.Enum_Comp = Ident_3;
Ptr_Glob->variant.var_1.Int_Comp = 40;
strcpy (Ptr_Glob->variant.var_1.Str_Comp,
"DHRYSTONE PROGRAM, SOME STRING");
strcpy (Str_1_Loc, "DHRYSTONE PROGRAM, 1'ST STRING");
Arr_2_Glob [8][7] = 10;
/* Was missing in published program. Without this statement, */
/* Arr_2_Glob [8][7] would have an undefined value. */
/* Warning: With 16-Bit processors and Number_Of_Runs > 32000, */
/* overflow may occur for this array element. */
printf ("\n");
printf ("Dhrystone Benchmark, Version 2.1 (Language: C)\n");
printf ("\n");
if (Reg)
{
printf ("Program compiled with 'register' attribute\n");
printf ("\n");
}
else
{
printf ("Program compiled without 'register' attribute\n");
printf ("\n");
}
printf ("Please give the number of runs through the benchmark: ");
{
// int n;
// scanf ("%d", &n);
Number_Of_Runs = 100;
}
printf ("\n");
printf ("Execution starts, %d runs through Dhrystone\n", Number_Of_Runs);
/***************/
/* Start timer */
/***************/
#ifdef IGN_TIMES
times (&time_info);
Begin_Time = (long) time_info.tms_utime;
#endif
#ifdef TIME
Begin_Time = time ( (long *) 0);
#ifdef RISCV
Begin_Insn = insn ( (long *) 0);
#endif
#endif
for (Run_Index = 1; Run_Index <= Number_Of_Runs; ++Run_Index)
{
Proc_5();
Proc_4();
/* Ch_1_Glob == 'A', Ch_2_Glob == 'B', Bool_Glob == true */
Int_1_Loc = 2;
Int_2_Loc = 3;
strcpy (Str_2_Loc, "DHRYSTONE PROGRAM, 2'ND STRING");
Enum_Loc = Ident_2;
Bool_Glob = ! Func_2 (Str_1_Loc, Str_2_Loc);
/* Bool_Glob == 1 */
while (Int_1_Loc < Int_2_Loc) /* loop body executed once */
{
Int_3_Loc = 5 * Int_1_Loc - Int_2_Loc;
/* Int_3_Loc == 7 */
Proc_7 (Int_1_Loc, Int_2_Loc, &Int_3_Loc);
/* Int_3_Loc == 7 */
Int_1_Loc += 1;
} /* while */
/* Int_1_Loc == 3, Int_2_Loc == 3, Int_3_Loc == 7 */
Proc_8 (Arr_1_Glob, Arr_2_Glob, Int_1_Loc, Int_3_Loc);
/* Int_Glob == 5 */
Proc_1 (Ptr_Glob);
for (Ch_Index = 'A'; Ch_Index <= Ch_2_Glob; ++Ch_Index)
/* loop body executed twice */
{
if (Enum_Loc == Func_1 (Ch_Index, 'C'))
/* then, not executed */
{
Proc_6 (Ident_1, &Enum_Loc);
strcpy (Str_2_Loc, "DHRYSTONE PROGRAM, 3'RD STRING");
Int_2_Loc = Run_Index;
Int_Glob = Run_Index;
}
}
/* Int_1_Loc == 3, Int_2_Loc == 3, Int_3_Loc == 7 */
Int_2_Loc = Int_2_Loc * Int_1_Loc;
Int_1_Loc = Int_2_Loc / Int_3_Loc;
Int_2_Loc = 7 * (Int_2_Loc - Int_3_Loc) - Int_1_Loc;
/* Int_1_Loc == 1, Int_2_Loc == 13, Int_3_Loc == 7 */
Proc_2 (&Int_1_Loc);
/* Int_1_Loc == 5 */
} /* loop "for Run_Index" */
/**************/
/* Stop timer */
/**************/
#ifdef IGN_TIMES
times (&time_info);
End_Time = (long) time_info.tms_utime;
#endif
#ifdef TIME
End_Time = time ( (long *) 0);
#ifdef RISCV
End_Insn = insn ( (long *) 0);
#endif
#endif
printf ("Execution ends\n");
printf ("\n");
printf ("Final values of the variables used in the benchmark:\n");
printf ("\n");
printf ("Int_Glob: %d\n", Int_Glob);
printf (" should be: %d\n", 5);
printf ("Bool_Glob: %d\n", Bool_Glob);
printf (" should be: %d\n", 1);
printf ("Ch_1_Glob: %c\n", Ch_1_Glob);
printf (" should be: %c\n", 'A');
printf ("Ch_2_Glob: %c\n", Ch_2_Glob);
printf (" should be: %c\n", 'B');
printf ("Arr_1_Glob[8]: %d\n", Arr_1_Glob[8]);
printf (" should be: %d\n", 7);
printf ("Arr_2_Glob[8][7]: %d\n", Arr_2_Glob[8][7]);
printf (" should be: Number_Of_Runs + 10\n");
printf ("Ptr_Glob->\n");
printf (" Ptr_Comp: %d\n", (int) Ptr_Glob->Ptr_Comp);
printf (" should be: (implementation-dependent)\n");
printf (" Discr: %d\n", Ptr_Glob->Discr);
printf (" should be: %d\n", 0);
printf (" Enum_Comp: %d\n", Ptr_Glob->variant.var_1.Enum_Comp);
printf (" should be: %d\n", 2);
printf (" Int_Comp: %d\n", Ptr_Glob->variant.var_1.Int_Comp);
printf (" should be: %d\n", 17);
printf (" Str_Comp: %s\n", Ptr_Glob->variant.var_1.Str_Comp);
printf (" should be: DHRYSTONE PROGRAM, SOME STRING\n");
printf ("Next_Ptr_Glob->\n");
printf (" Ptr_Comp: %d\n", (int) Next_Ptr_Glob->Ptr_Comp);
printf (" should be: (implementation-dependent), same as above\n");
printf (" Discr: %d\n", Next_Ptr_Glob->Discr);
printf (" should be: %d\n", 0);
printf (" Enum_Comp: %d\n", Next_Ptr_Glob->variant.var_1.Enum_Comp);
printf (" should be: %d\n", 1);
printf (" Int_Comp: %d\n", Next_Ptr_Glob->variant.var_1.Int_Comp);
printf (" should be: %d\n", 18);
printf (" Str_Comp: %s\n",
Next_Ptr_Glob->variant.var_1.Str_Comp);
printf (" should be: DHRYSTONE PROGRAM, SOME STRING\n");
printf ("Int_1_Loc: %d\n", Int_1_Loc);
printf (" should be: %d\n", 5);
printf ("Int_2_Loc: %d\n", Int_2_Loc);
printf (" should be: %d\n", 13);
printf ("Int_3_Loc: %d\n", Int_3_Loc);
printf (" should be: %d\n", 7);
printf ("Enum_Loc: %d\n", Enum_Loc);
printf (" should be: %d\n", 1);
printf ("Str_1_Loc: %s\n", Str_1_Loc);
printf (" should be: DHRYSTONE PROGRAM, 1'ST STRING\n");
printf ("Str_2_Loc: %s\n", Str_2_Loc);
printf (" should be: DHRYSTONE PROGRAM, 2'ND STRING\n");
printf ("\n");
User_Time = End_Time - Begin_Time;
#ifdef RISCV
User_Insn = End_Insn - Begin_Insn;
printf("Number_Of_Runs: %d\n", Number_Of_Runs);
printf("User_Time: %d cycles, %d insn\n", User_Time, User_Insn);
int Cycles_Per_Instruction_x1000 = (1000 * User_Time) / User_Insn;
printf("Cycles_Per_Instruction: %d.%d%d%d\n", Cycles_Per_Instruction_x1000 / 1000,
(Cycles_Per_Instruction_x1000 / 100) % 10,
(Cycles_Per_Instruction_x1000 / 10) % 10,
(Cycles_Per_Instruction_x1000 / 1) % 10);
int Dhrystones_Per_Second_Per_MHz = (Number_Of_Runs * 1000000) / User_Time;
printf("Dhrystones_Per_Second_Per_MHz: %d\n", Dhrystones_Per_Second_Per_MHz);
/*
* "Another common representation of the Dhrystone benchmark is the DMIPS (Dhrystone MIPS) obtained
* when the Dhrystone score is divided by 1757 (the number of Dhrystones per second obtained on the
* VAX 11/780, nominally a 1 MIPS machine)."
*/
int DMIPS_Per_MHz_x1000 = (1000 * Dhrystones_Per_Second_Per_MHz) / 1757;
printf("DMIPS_Per_MHz: %d.%d%d%d\n", DMIPS_Per_MHz_x1000 / 1000,
(DMIPS_Per_MHz_x1000 / 100) % 10,
(DMIPS_Per_MHz_x1000 / 10) % 10,
(DMIPS_Per_MHz_x1000 / 1) % 10);
#else
if (User_Time < Too_Small_Time)
{
printf ("Measured time too small to obtain meaningful results\n");
printf ("Please increase number of runs\n");
printf ("\n");
}
else
{
#ifdef TIME
Microseconds = (float) User_Time * Mic_secs_Per_Second
/ (float) Number_Of_Runs;
Dhrystones_Per_Second = (float) Number_Of_Runs / (float) User_Time;
#else
Microseconds = (float) User_Time * Mic_secs_Per_Second
/ ((float) HZ * ((float) Number_Of_Runs));
Dhrystones_Per_Second = ((float) HZ * (float) Number_Of_Runs)
/ (float) User_Time;
#endif
printf ("Microseconds for one run through Dhrystone: ");
printf ("%6.1f \n", Microseconds);
printf ("Dhrystones per Second: ");
printf ("%6.1f \n", Dhrystones_Per_Second);
printf ("\n");
}
#endif
return 0;
}
/* Proc_1: works on the record Ptr_Val_Par and on the record it links */
/* to (Ptr_Val_Par->Ptr_Comp). Copies *Ptr_Glob into the linked record, */
/* updates Int_Comp / Enum_Comp / Ptr_Comp fields and calls Proc_3, */
/* Proc_6 and Proc_7 along the way. */
/* K&R definition with implicit int return type; nothing is returned. */
/* The statement sequence belongs to the Dhrystone statement mix and is */
/* intentionally left as published. */
Proc_1 (Ptr_Val_Par)
/******************/
REG Rec_Pointer Ptr_Val_Par;
/* executed once */
{
REG Rec_Pointer Next_Record = Ptr_Val_Par->Ptr_Comp;
/* == Next_Ptr_Glob (main calls Proc_1 (Ptr_Glob)) */
/* Local variable, initialized with Ptr_Val_Par->Ptr_Comp, */
/* corresponds to "rename" in Ada, "with" in Pascal */
/* Whole-record copy: the linked record becomes a copy of *Ptr_Glob */
structassign (*Ptr_Val_Par->Ptr_Comp, *Ptr_Glob);
Ptr_Val_Par->variant.var_1.Int_Comp = 5;
Next_Record->variant.var_1.Int_Comp
= Ptr_Val_Par->variant.var_1.Int_Comp;
Next_Record->Ptr_Comp = Ptr_Val_Par->Ptr_Comp;
Proc_3 (&Next_Record->Ptr_Comp);
/* Ptr_Val_Par->Ptr_Comp->Ptr_Comp
== Ptr_Glob->Ptr_Comp */
if (Next_Record->Discr == Ident_1)
/* then, executed */
{
Next_Record->variant.var_1.Int_Comp = 6;
Proc_6 (Ptr_Val_Par->variant.var_1.Enum_Comp,
&Next_Record->variant.var_1.Enum_Comp);
Next_Record->Ptr_Comp = Ptr_Glob->Ptr_Comp;
/* Proc_7 (6, 10, ...) stores 6 + 2 + 10 == 18 into Int_Comp */
Proc_7 (Next_Record->variant.var_1.Int_Comp, 10,
&Next_Record->variant.var_1.Int_Comp);
}
else /* not executed */
structassign (*Ptr_Val_Par, *Ptr_Val_Par->Ptr_Comp);
} /* Proc_1 */
/* Proc_2: rewrites *Int_Par_Ref as (*Int_Par_Ref + 10 - 1) - Int_Glob */
/* when Ch_1_Glob == 'A'. */
/* K&R definition with implicit int return type; nothing is returned. */
Proc_2 (Int_Par_Ref)
/******************/
/* executed once */
/* *Int_Par_Ref == 1, becomes 5 (1 + 10 - 1 - Int_Glob, Int_Glob == 5; */
/* main prints "Int_1_Loc ... should be: 5") */
One_Fifty *Int_Par_Ref;
{
One_Fifty Int_Loc;
Enumeration Enum_Loc;
Int_Loc = *Int_Par_Ref + 10;
do /* executed once */
if (Ch_1_Glob == 'A')
/* then, executed */
{
Int_Loc -= 1;
*Int_Par_Ref = Int_Loc - Int_Glob;
Enum_Loc = Ident_1;
} /* if */
/* NOTE(review): Enum_Loc is assigned only inside the "if"; if */
/* Ch_1_Glob were not 'A' the condition below would read an */
/* uninitialized value. */
while (Enum_Loc != Ident_1); /* false after the "if" ran: loop exits */
} /* Proc_2 */
/* Proc_3: stores Ptr_Glob->Ptr_Comp through Ptr_Ref_Par (when Ptr_Glob */
/* is not Null), then calls Proc_7 to overwrite Ptr_Glob's Int_Comp. */
/* K&R definition with implicit int return type; nothing is returned. */
Proc_3 (Ptr_Ref_Par)
/******************/
/* executed once */
/* *Ptr_Ref_Par becomes Ptr_Glob->Ptr_Comp */
Rec_Pointer *Ptr_Ref_Par;
{
if (Ptr_Glob != Null)
/* then, executed */
*Ptr_Ref_Par = Ptr_Glob->Ptr_Comp;
/* Not guarded by the test above: Ptr_Glob is dereferenced either way. */
/* With Int_Glob == 5 this stores 10 + 2 + 5 == 17 into Int_Comp. */
Proc_7 (10, Int_Glob, &Ptr_Glob->variant.var_1.Int_Comp);
} /* Proc_3 */
/* Proc_4: ORs the result of (Ch_1_Glob == 'A') into Bool_Glob and sets */
/* Ch_2_Glob to 'B'. */
/* K&R definition with implicit int return type; nothing is returned. */
Proc_4 () /* without parameters */
/*******/
/* executed once */
{
Boolean Bool_Loc;
Bool_Loc = Ch_1_Glob == 'A';
/* bitwise OR of two int-valued Booleans (the "| (OR)" operator counted */
/* in the statistics of dhry.h) */
Bool_Glob = Bool_Loc | Bool_Glob;
Ch_2_Glob = 'B';
} /* Proc_4 */
/* Proc_5: resets two globals at the start of every benchmark run: */
/* Ch_1_Glob becomes 'A' and Bool_Glob becomes false. */
/* K&R definition with implicit int return type; nothing is returned. */
Proc_5 () /* without parameters */
/*******/
/* executed once */
{
Ch_1_Glob = 'A';
Bool_Glob = false;
} /* Proc_5 */
/* Procedure for the assignment of structures, */
/* if the C compiler doesn't support this feature */
#ifdef NOSTRUCTASSIGN
/* Byte-by-byte copy of l bytes from s to d; only compiled when the */
/* structassign macro is routed through memcpy. */
memcpy (d, s, l)
register char *d;
register char *s;
register int l;
{
  while (l != 0)
  {
    *d = *s;
    ++d;
    ++s;
    --l;
  }
}
#endif

View File

@@ -0,0 +1,192 @@
/*
****************************************************************************
*
* "DHRYSTONE" Benchmark Program
* -----------------------------
*
* Version: C, Version 2.1
*
* File: dhry_2.c (part 3 of 3)
*
* Date: May 25, 1988
*
* Author: Reinhold P. Weicker
*
****************************************************************************
*/
#include "dhry.h"
#ifndef REG
#define REG
/* REG becomes defined as empty */
/* i.e. no register variables */
#endif
extern int Int_Glob;
extern char Ch_1_Glob;
/* Proc_6: maps Enum_Val_Par to a new enumeration value stored through */
/* Enum_Ref_Par. The value is first copied, replaced by Ident_4 when */
/* Func_3 rejects it, and then overwritten by the switch for every case */
/* except Ident_4. */
/* K&R definition with implicit int return type; nothing is returned. */
Proc_6 (Enum_Val_Par, Enum_Ref_Par)
/*********************************/
/* executed once */
/* Enum_Val_Par == Ident_3, Enum_Ref_Par becomes Ident_2 */
Enumeration Enum_Val_Par;
Enumeration *Enum_Ref_Par;
{
*Enum_Ref_Par = Enum_Val_Par;
if (! Func_3 (Enum_Val_Par))
/* then, not executed */
*Enum_Ref_Par = Ident_4;
switch (Enum_Val_Par)
{
case Ident_1:
*Enum_Ref_Par = Ident_1;
break;
case Ident_2:
if (Int_Glob > 100)
/* then */
*Enum_Ref_Par = Ident_1;
else *Enum_Ref_Par = Ident_4;
break;
case Ident_3: /* executed */
*Enum_Ref_Par = Ident_2;
break;
case Ident_4: break; /* keeps the value assigned before the switch */
case Ident_5:
*Enum_Ref_Par = Ident_3;
break;
} /* switch */
} /* Proc_6 */
/* Proc_7: stores Int_1_Par_Val + 2 + Int_2_Par_Val through Int_Par_Ref. */
/* K&R definition with implicit int return type; nothing is returned. */
Proc_7 (Int_1_Par_Val, Int_2_Par_Val, Int_Par_Ref)
/**********************************************/
/* executed three times */
/* first call: Int_1_Par_Val == 2, Int_2_Par_Val == 3, */
/* Int_Par_Ref becomes 7 */
/* second call: Int_1_Par_Val == 10, Int_2_Par_Val == 5, */
/* Int_Par_Ref becomes 17 */
/* third call: Int_1_Par_Val == 6, Int_2_Par_Val == 10, */
/* Int_Par_Ref becomes 18 */
One_Fifty Int_1_Par_Val;
One_Fifty Int_2_Par_Val;
One_Fifty *Int_Par_Ref;
{
One_Fifty Int_Loc;
Int_Loc = Int_1_Par_Val + 2;
*Int_Par_Ref = Int_2_Par_Val + Int_Loc;
} /* Proc_7 */
/* Proc_8: writes a handful of elements of the two arrays, indexed */
/* relative to Int_Loc = Int_1_Par_Val + 5, and sets Int_Glob to 5. */
/* The array parameters are modified in place. */
/* K&R definition with implicit int return type; nothing is returned. */
Proc_8 (Arr_1_Par_Ref, Arr_2_Par_Ref, Int_1_Par_Val, Int_2_Par_Val)
/*********************************************************************/
/* executed once */
/* Int_1_Par_Val == 3 */
/* Int_2_Par_Val == 7 */
Arr_1_Dim Arr_1_Par_Ref;
Arr_2_Dim Arr_2_Par_Ref;
int Int_1_Par_Val;
int Int_2_Par_Val;
{
REG One_Fifty Int_Index;
REG One_Fifty Int_Loc;
Int_Loc = Int_1_Par_Val + 5;
/* with Int_1_Par_Val == 3: Int_Loc == 8 */
Arr_1_Par_Ref [Int_Loc] = Int_2_Par_Val;
Arr_1_Par_Ref [Int_Loc+1] = Arr_1_Par_Ref [Int_Loc];
Arr_1_Par_Ref [Int_Loc+30] = Int_Loc;
/* two iterations: columns Int_Loc and Int_Loc+1 of row Int_Loc */
for (Int_Index = Int_Loc; Int_Index <= Int_Loc+1; ++Int_Index)
Arr_2_Par_Ref [Int_Loc] [Int_Index] = Int_Loc;
/* element [8][7] in the benchmark run: incremented once per run, hence */
/* the expected final value "Number_Of_Runs + 10" printed by main */
Arr_2_Par_Ref [Int_Loc] [Int_Loc-1] += 1;
Arr_2_Par_Ref [Int_Loc+20] [Int_Loc] = Arr_1_Par_Ref [Int_Loc];
Int_Glob = 5;
} /* Proc_8 */
/* Func_1: compares two characters. Returns Ident_1 when they differ; */
/* when they are equal it stores the character in Ch_1_Glob and returns */
/* Ident_2. */
Enumeration Func_1 (Ch_1_Par_Val, Ch_2_Par_Val)
/*************************************************/
/* executed three times */
/* first call: Ch_1_Par_Val == 'R', Ch_2_Par_Val == 'Y' */
/* (from Func_2: Str_1_Par_Ref[2] and Str_2_Par_Ref[3], */
/* 0-based C indexing) */
/* second call: Ch_1_Par_Val == 'A', Ch_2_Par_Val == 'C' */
/* third call: Ch_1_Par_Val == 'B', Ch_2_Par_Val == 'C' */
Capital_Letter Ch_1_Par_Val;
Capital_Letter Ch_2_Par_Val;
{
Capital_Letter Ch_1_Loc;
Capital_Letter Ch_2_Loc;
Ch_1_Loc = Ch_1_Par_Val;
Ch_2_Loc = Ch_1_Loc;
if (Ch_2_Loc != Ch_2_Par_Val)
/* then, executed */
return (Ident_1);
else /* not executed */
{
Ch_1_Glob = Ch_1_Loc;
return (Ident_2);
}
} /* Func_1 */
/* Func_2: compares two strings. Returns true when the local character */
/* Ch_Loc is 'R' or when strcmp reports the first string as greater than */
/* the second (Int_Glob is then also updated); returns false otherwise. */
Boolean Func_2 (Str_1_Par_Ref, Str_2_Par_Ref)
/*************************************************/
/* executed once */
/* Str_1_Par_Ref == "DHRYSTONE PROGRAM, 1'ST STRING" */
/* Str_2_Par_Ref == "DHRYSTONE PROGRAM, 2'ND STRING" */
Str_30 Str_1_Par_Ref;
Str_30 Str_2_Par_Ref;
{
REG One_Thirty Int_Loc;
Capital_Letter Ch_Loc;
Int_Loc = 2;
/* The loop can only terminate through the "if" body (Int_Loc += 1), */
/* so Ch_Loc is always assigned before it is read below. */
while (Int_Loc <= 2) /* loop body executed once */
if (Func_1 (Str_1_Par_Ref[Int_Loc],
Str_2_Par_Ref[Int_Loc+1]) == Ident_1)
/* then, executed */
{
Ch_Loc = 'A';
Int_Loc += 1;
} /* if, while */
if (Ch_Loc >= 'W' && Ch_Loc < 'Z')
/* then, not executed */
Int_Loc = 7;
if (Ch_Loc == 'R')
/* then, not executed */
return (true);
else /* executed */
{
if (strcmp (Str_1_Par_Ref, Str_2_Par_Ref) > 0)
/* then, not executed */
{
Int_Loc += 7;
Int_Glob = Int_Loc;
return (true);
}
else /* executed */
return (false);
} /* if Ch_Loc */
} /* Func_2 */
/* Func_3: returns true when Enum_Par_Val is Ident_3, false otherwise. */
Boolean Func_3 (Enum_Par_Val)
/***************************/
/* executed once */
/* Enum_Par_Val == Ident_3 */
Enumeration Enum_Par_Val;
{
Enumeration Enum_Loc;
Enum_Loc = Enum_Par_Val;
if (Enum_Loc == Ident_3)
/* then, executed */
return (true);
else /* not executed */
return (false);
} /* Func_3 */

View File

@@ -0,0 +1,34 @@
#include <perf.h>
/* Timer hook for the benchmark's TIME path: forwards the value of */
/* rdcycle(). */
long time() {
  long cycles = rdcycle();
  return cycles;
}
/* Instruction-count hook for the benchmark's RISCV path: forwards the */
/* value of rdinstret(). */
long insn() {
  long retired = rdinstret();
  return retired;
}
/* Tells the benchmark whether counters are available; this stub always */
/* answers yes (1). */
int has_counters() {
  enum { COUNTERS_PRESENT = 1 };
  return COUNTERS_PRESENT;
}
/* Copies the NUL-terminated string src (terminator included) to dest */
/* and returns dest. */
char *strcpy(char *dest, const char *src) {
  char *out = dest;
  for (;;) {
    char c = *src++;
    *out++ = c;
    if (c == '\0') {
      break;
    }
  }
  return dest;
}
/* Compares two NUL-terminated strings byte by byte as unsigned chars. */
/* Returns the difference of the first pair of bytes that differ, or 0 */
/* when both strings end at the same position. */
int strcmp (const char *p1, const char *p2) {
  const unsigned char *a = (const unsigned char *) p1;
  const unsigned char *b = (const unsigned char *) p2;
  for (;; ++a, ++b) {
    if (*a != *b || *a == '\0') {
      return *a - *b;
    }
  }
}

View File

@@ -1,6 +1,6 @@
include ../../../FIRMWARE/makefile.inc
RVASFLAGS=-march=$(ARCH) -mabi=$(ABI)
RVCFLAGS=-Os -fno-pic -march=$(ARCH) -mabi=$(ABI) -fno-stack-protector -w -Wl,--no-relax
RVCFLAGS=-I. -O2 -fno-pic -march=$(ARCH) -mabi=$(ABI) -fno-stack-protector -w -Wl,--no-relax
RAM_SIZE=6144

View File

@@ -0,0 +1,7 @@
/* Single-translation-unit build of the Dhrystone test: the three */
/* configuration macros are defined first, then the benchmark sources */
/* are textually included. */
/* RISCV: enables the instruction-count (insn) measurement and the */
/* cycles / CPI / DMIPS report in dhry_1.c. */
/* TIME: selects the "time ()" timing path and keeps dhry.h from */
/* defining TIMES and including <sys/times.h>. */
/* USE_MYSTDLIB: dhry_1.c then skips <stdlib.h> and <string.h>. */
/* NOTE(review): dhry.h lists "separate compilation" and "no procedure */
/* merging" among its ground rules; including all sources here hands the */
/* compiler a single translation unit - confirm this is intended. */
#define RISCV
#define TIME
#define USE_MYSTDLIB
#include "DHRYSTONE/dhry_1.c"
#include "DHRYSTONE/dhry_2.c"
#include "DHRYSTONE/stubs.c"

View File

@@ -1,5 +1,5 @@
/*
* pipeline6.v
* pipeline7.v
* Let us see how to morph our multi-cycle CPU into a pipelined CPU !
* Step 7: a flavor of branch prediction
* static branch prediction

View File

@@ -1,5 +1,5 @@
/*
* pipeline6.v
* pipeline8.v
* Let us see how to morph our multi-cycle CPU into a pipelined CPU !
* Step 8: dynamic branch prediction
*/
@@ -610,7 +610,7 @@ module Processor (
if(halt) begin
$display("Simulated processor's report");
$display("----------------------------");
$display("Pred hits = %3.3f\%%",
$display("Branch hits= %3.3f\%%",
nbPredictHit*100.0/nbBranch );
$display("CPI = %3.3f",(cycle*1.0)/(instret*1.0));
$display("Instr. mix = (Branch:%3.3f\%% JAL:%3.3f\%% JALR:%3.3f\%%)",

View File

@@ -0,0 +1,801 @@
/*
* pipeline9.v
* Let us see how to morph our multi-cycle CPU into a pipelined CPU !
* Step 9: return address stack
*/
`default_nettype none
`include "clockworks.v"
`include "emitter_uart.v"
//`define VERBOSE
/******************************************************************************/
module Processor (
input clk,
input resetn,
output [31:0] IO_mem_addr, // IO memory address
input [31:0] IO_mem_rdata, // data read from IO memory
output [31:0] IO_mem_wdata, // data written to IO memory
output IO_mem_wr // IO write flag
);
`include "riscv_disassembly.v"
/******************************************************************************/
/*
Reminder for the 10 RISC-V codeops
----------------------------------
ALUreg // rd <- rs1 OP rs2
ALUimm // rd <- rs1 OP Iimm
Branch // if(rs1 OP rs2) PC<-PC+Bimm
JALR // rd <- PC+4; PC<-rs1+Iimm
JAL // rd <- PC+4; PC<-PC+Jimm
AUIPC // rd <- PC + Uimm
LUI // rd <- Uimm
Load // rd <- mem[rs1+Iimm]
Store // mem[rs1+Simm] <- rs2
SYSTEM // special
*/
/******************************************************************************/
/* Instruction decoder as functions (we will use them several times) */
/* The 10 "recognizers" for the 10 codeops */
// Each recognizer returns 1 when the 7-bit opcode field I[6:0] of the
// instruction word matches the given codeop.
function isALUreg;
   input [31:0] I;
   isALUreg = (I[6:0] == 7'b0110011);
endfunction

function isALUimm;
   input [31:0] I;
   isALUimm = (I[6:0] == 7'b0010011);
endfunction

function isBranch;
   input [31:0] I;
   isBranch = (I[6:0] == 7'b1100011);
endfunction

function isJALR;
   input [31:0] I;
   isJALR = (I[6:0] == 7'b1100111);
endfunction

function isJAL;
   input [31:0] I;
   isJAL = (I[6:0] == 7'b1101111);
endfunction

function isAUIPC;
   input [31:0] I;
   isAUIPC = (I[6:0] == 7'b0010111);
endfunction

function isLUI;
   input [31:0] I;
   isLUI = (I[6:0] == 7'b0110111);
endfunction

function isLoad;
   input [31:0] I;
   isLoad = (I[6:0] == 7'b0000011);
endfunction

function isStore;
   input [31:0] I;
   isStore = (I[6:0] == 7'b0100011);
endfunction

function isSYSTEM;
   input [31:0] I;
   isSYSTEM = (I[6:0] == 7'b1110011);
endfunction
/* Register indices */
// Fixed-position instruction fields, extracted with indexed part-selects
// (base +: width).
function [4:0] rs1Id;
   input [31:0] I;
   rs1Id = I[15 +: 5];
endfunction
function [4:0] rs2Id;
   input [31:0] I;
   rs2Id = I[20 +: 5];
endfunction
// Shift amount of immediate shifts: same bits as rs2Id.
function [4:0] shamt;
   input [31:0] I;
   shamt = I[20 +: 5];
endfunction
function [4:0] rdId;
   input [31:0] I;
   rdId = I[7 +: 5];
endfunction
// Two-bit CSR selector built from instruction bits 27 and 21
// (decoded by the case statement in the M-stage register block).
function [1:0] csrId;
   input [31:0] I;
   csrId = {I[27], I[21]};
endfunction
/* funct3 and funct7 */
function [2:0] funct3;
   input [31:0] I;
   funct3 = I[12 +: 3];
endfunction
function [6:0] funct7;
   input [31:0] I;
   funct7 = I[25 +: 7];
endfunction
/* EBREAK and CSRRS instruction "recognizers" */
// SYSTEM-class instruction with funct3 == 000.
// Note: only opcode and funct3 are checked, so any SYSTEM instruction
// with funct3 == 000 is treated as EBREAK by this core.
function isEBREAK;
   input [31:0] I;
   isEBREAK = ((funct3(I) == 3'b000) && isSYSTEM(I));
endfunction
// SYSTEM-class instruction with funct3 == 010 (CSRRS).
function isCSRRS;
   input [31:0] I;
   isCSRRS = ((funct3(I) == 3'b010) && isSYSTEM(I));
endfunction
/* The 5 immediate formats */
// U-type: upper 20 bits of the instruction, low 12 bits cleared.
function [31:0] Uimm;
   input [31:0] I;
   Uimm = {I[31:12], 12'b0};
endfunction
// I-type: 12-bit immediate I[31:20], sign-extended from I[31].
function [31:0] Iimm;
   input [31:0] I;
   Iimm = {{20{I[31]}}, I[31:20]};
endfunction
// S-type: {I[31:25], I[11:7]}, sign-extended from I[31].
function [31:0] Simm;
   input [31:0] I;
   Simm = {{20{I[31]}}, I[31:25], I[11:7]};
endfunction
// B-type: 13-bit even offset, sign-extended from I[31].
function [31:0] Bimm;
   input [31:0] I;
   Bimm = {{19{I[31]}}, I[31], I[7], I[30:25], I[11:8], 1'b0};
endfunction
// J-type: 21-bit even offset, sign-extended from I[31].
function [31:0] Jimm;
   input [31:0] I;
   Jimm = {{11{I[31]}}, I[31], I[19:12], I[20], I[30:21], 1'b0};
endfunction
// 1 for every instruction class except Store and Branch.
function writesRd;
   input [31:0] I;
   writesRd = !(isStore(I) || isBranch(I));
endfunction
// 1 unless the instruction is JAL, AUIPC or LUI.
function readsRs1;
   input [31:0] I;
   readsRs1 = !isJAL(I) && !isAUIPC(I) && !isLUI(I);
endfunction
// 1 for Store, Branch and register-register ALU instructions.
function readsRs2;
   input [31:0] I;
   readsRs2 = isStore(I) || isBranch(I) || isALUreg(I);
endfunction
/******************************************************************************/
// 64-bit counters readable through the CSR path of the M stage.
reg [63:0] cycle;   // cleared while resetn is low, +1 every clock otherwise
reg [63:0] instret; // updated in the M-stage register block
always @(posedge clk) begin
   cycle <= resetn ? cycle + 1 : 0;
end
// Pipeline control signals; they are assigned by the hazard logic near
// the end of the module.
wire D_flush, E_flush;
wire F_stall, D_stall;
wire halt; // Halt execution (on ebreak)
/******************************************************************************/
// Bubble inserted into the pipeline: an ALUreg instruction with all
// register indices, funct3 and funct7 equal to zero.
localparam NOP = 32'b0000000_00000_00000_000_00000_0110011;
/*** F: Instruction fetch ***/
// Sequential program counter: holds F_PC+4 of the last fetched instruction.
reg [31:0] PC;
reg [31:0] PROGROM[0:16383]; // 16384 4-byte words
// = 64 KB of program ROM, indexed by word address F_PC[15:2]
initial begin
$readmemh("PROGROM.hex",PROGROM);
end
// Note: E's jumpOrBranch signals are registered in EM (1 cycle later),
// hence taken into account in F_PC mux (1 cycle before). Doing so
// avoids a *huge* critical path (that generates E_JumpOrBranch, that
// uses the ALU branch result E_takeBranch, and hence that comprises
// register forwarding & ALU)
//
// Address fetched this cycle, by priority:
//   1. target predicted by the D stage (JAL, JALR, predicted-taken branch)
//   2. correction address registered from the E stage (misprediction)
//   3. sequential PC
wire [31:0] F_PC =
D_JumpOrBranchNow ? D_JumpOrBranchAddr :
EM_JumpOrBranchNow ? EM_JumpOrBranchAddr :
PC;
always @(posedge clk) begin
// Fetch only when F is not stalled; on a stall FD_instr/FD_PC/PC hold.
if(!F_stall) begin
FD_instr <= PROGROM[F_PC[15:2]];
FD_PC <= F_PC;
PC <= F_PC+4;
end
// Cannot write NOP to FD_instr, because
// whenever a BRAM read is involved, do
// nothing else than sending the result
// to a reg.
// Instead, FD_nop marks the fetched instruction as a bubble; it is set
// on a D flush and during reset, regardless of F_stall.
FD_nop <= D_flush | !resetn;
// Reset has the last word on PC (last nonblocking assignment wins).
if(!resetn) begin
PC <= 0;
end
end
/******************************************************************************/
// F -> D pipeline registers
reg [31:0] FD_PC;    // address of the instruction held in FD_instr
reg [31:0] FD_instr; // fetched instruction word
reg FD_nop;          // 1: treat FD_instr as a bubble
/******************************************************************************/
/*** D: Instruction decode ***/
// Branch prediction
// Prediction success rates reported by the author (the *** line is the
// configuration selected by the localparams below):
//     83% success with HISTO_BITS=8, ADDR_BITS=12
// *** 80% success with HISTO_BITS=5, ADDR_BITS=10
//     78% success with HISTO_BITS=4, ADDR_BITS=8
localparam BP_HISTO_BITS=5;
localparam BP_ADDR_BITS=10;
localparam BP_SIZE=1<<BP_ADDR_BITS;
reg [BP_HISTO_BITS-1:0] PHT[BP_SIZE-1:0]; // Pattern History Table
reg [1:0] BHT[BP_SIZE-1:0]; // Branch History Table

// Give both tables a defined start-up content. Without it, the very
// first prediction reads X in a 4-state simulator; that X reaches
// F_PC through D_JumpOrBranchNow and the BHT update (indexed with an X
// address) can never clear it. All-zero matches what a 2-state
// simulator starts from: empty history, counters at "strongly not taken".
integer BP_init_i;
initial begin
   for(BP_init_i=0; BP_init_i<BP_SIZE; BP_init_i=BP_init_i+1) begin
      PHT[BP_init_i] = {BP_HISTO_BITS{1'b0}};
      BHT[BP_init_i] = 2'b00;
   end
end

// Index of the history entry for the instruction at address PC:
// BP_ADDR_BITS bits of the word address.
function [BP_ADDR_BITS-1:0] PHT_index;
   input [31:0] PC;
   PHT_index = PC[BP_ADDR_BITS+1:2]; // pshare
   //PHT_index = 0; // gshare
endfunction

// Index of the 2-bit counter for the instruction at address PC.
function [BP_ADDR_BITS-1:0] BHT_index;
   input [31:0] PC;
   // Choose indexing for dynamic branch prediction
   // (uncomment one of the following choices)
   // Used if D_predictBranch is set to dynamic (later in this file)
   // 1: simple 2-bits counter without history
   // BHT_index = PHT_index(PC);
   // 2: gselect
   // /* verilator lint_off WIDTH */
   // BHT_index = {PHT_index(PC), PHT[PHT_index(PC)]};
   // /* verilator lint_on WIDTH */
   // 3: pshare/gshare
   // The history is left-aligned in the index and XORed with the
   // address bits.
   BHT_index = PHT_index(PC) ^
               {PHT[PHT_index(PC)],{BP_ADDR_BITS-BP_HISTO_BITS{1'b0}}};
endfunction
// Branch direction predicted in the D stage.
// Exactly one of the following strategies must be active:
//wire D_predictBranch = 1'd0; // 1. predict not taken
//wire D_predictBranch = 1'd1; // 2. predict taken
//wire D_predictBranch = FD_instr[31]; // 3. BTFNT
// 4. dynamic: upper bit of the 2-bit counter selected by BHT_index
wire D_predictBranch = BHT[BHT_index(FD_PC)][1];
// The D stage redirects the next fetch for every JAL, every JALR
// (target guessed, see D_JumpOrBranchAddr) and every branch that is
// predicted taken, unless the instruction in D is a bubble.
wire D_JumpOrBranchNow =
   !FD_nop && (
      isJAL(FD_instr)  ||
      isJALR(FD_instr) ||
      (D_predictBranch && isBranch(FD_instr))
   );
// Return address stack: 4 entries, RAS_0 is the top of the stack.
reg [31:0] RAS_0;
reg [31:0] RAS_1;
reg [31:0] RAS_2;
reg [31:0] RAS_3;
// Fetch target chosen by the D stage: top of the RAS for JALR,
// PC-relative target for JAL and predicted-taken branches.
wire [31:0] D_JumpOrBranchAddr =
   isJALR(FD_instr) ? RAS_0 :
   (FD_PC + (isJAL(FD_instr) ? Jimm(FD_instr) : Bimm(FD_instr)));

// RAS push/pop decision, following the return-address-stack hints of
// the RISC-V unprivileged specification (x1 and x5 are link registers):
//   - JAL/JALR writing a link register is a call        -> push PC+4
//   - JALR reading a link register is a return          -> pop
//     (unless rd is the same link register: push only)
//   - both at once (rd and rs1 are different link regs) -> pop then push
// Plain jumps (rd = x0) and indirect jumps through non-link registers
// leave the stack untouched. Pushing/popping on every JAL/JALR, as was
// done before, desynchronizes the stack on each "j label" and on each
// indirect call.
wire D_rdIsLink  = (rdId(FD_instr)  == 5'd1) || (rdId(FD_instr)  == 5'd5);
wire D_rs1IsLink = (rs1Id(FD_instr) == 5'd1) || (rs1Id(FD_instr) == 5'd5);
wire D_RASpush = (isJAL(FD_instr) || isJALR(FD_instr)) && D_rdIsLink;
wire D_RASpop  = isJALR(FD_instr) && D_rs1IsLink &&
                 !(D_rdIsLink && (rdId(FD_instr) == rs1Id(FD_instr)));

/** These three signals come from the Writeback stage **/
wire wbEnable;
wire [31:0] wbData;
wire [4:0] wbRdId;
reg [31:0] RegisterBank [0:31];
always @(posedge clk) begin
   if(!D_stall) begin
      DE_PC <= FD_PC;
      DE_instr <= (E_flush | FD_nop) ? NOP : FD_instr;
      DE_predictBranch <= D_predictBranch;
      DE_predictRA <= RAS_0; // checked against the real JALR target in E
      DE_PHTindex <= PHT_index(FD_PC);
      DE_BHTindex <= BHT_index(FD_PC);
      // Update the RAS only for instructions that really enter E:
      // an instruction that is being flushed (E_flush) is on the wrong
      // path and must not modify the stack.
      if(!FD_nop && !E_flush) begin
         if(D_RASpush && D_RASpop) begin
            // pop then push: only the top of the stack changes
            RAS_0 <= FD_PC + 4;
         end else if(D_RASpush) begin
            RAS_3 <= RAS_2;
            RAS_2 <= RAS_1;
            RAS_1 <= RAS_0;
            RAS_0 <= FD_PC + 4;
            // $display("push(%0h)",FD_PC+4);
         end else if(D_RASpop) begin
            // $display("pop()");
            RAS_0 <= RAS_1;
            RAS_1 <= RAS_2;
            RAS_2 <= RAS_3;
         end
      end
   end
   // A flush overrides a stall: the E stage receives a bubble.
   if(E_flush) begin
      DE_instr <= NOP;
   end
   if(wbEnable) begin
      RegisterBank[wbRdId] <= wbData;
   end
end
/******************************************************************************/
// D -> E pipeline registers
reg [31:0]             DE_PC;
reg [31:0]             DE_instr;
reg                    DE_predictBranch; // direction predicted in D
reg [31:0]             DE_predictRA;     // RAS top captured in D
reg [BP_ADDR_BITS-1:0] DE_PHTindex;      // predictor indices computed in D,
reg [BP_ADDR_BITS-1:0] DE_BHTindex;      // reused for the update in E
// Register file read, addressed by the instruction now in E
wire [31:0] DE_rs1 = RegisterBank[rs1Id(DE_instr)];
wire [31:0] DE_rs2 = RegisterBank[rs2Id(DE_instr)];
/******************************************************************************/
/*** E: Execute ***/
/*********** Register forwarding *************************************/
// A source operand of the instruction in E is forwarded from M (resp. W)
// when the instruction there writes a non-zero rd equal to that source.
wire E_M_fwd_rs1 = writesRd(EM_instr) && (rdId(EM_instr) != 0) &&
                   (rs1Id(DE_instr) == rdId(EM_instr));
wire E_W_fwd_rs1 = writesRd(MW_instr) && (rdId(MW_instr) != 0) &&
                   (rs1Id(DE_instr) == rdId(MW_instr));
wire E_M_fwd_rs2 = writesRd(EM_instr) && (rdId(EM_instr) != 0) &&
                   (rs2Id(DE_instr) == rdId(EM_instr));
wire E_W_fwd_rs2 = writesRd(MW_instr) && (rdId(MW_instr) != 0) &&
                   (rs2Id(DE_instr) == rdId(MW_instr));
// The M stage has priority over W; otherwise use the register file value.
wire [31:0] E_rs1 = E_M_fwd_rs1 ? EM_Eresult :
                    (E_W_fwd_rs1 ? wbData : DE_rs1);
wire [31:0] E_rs2 = E_M_fwd_rs2 ? EM_Eresult :
                    (E_W_fwd_rs2 ? wbData : DE_rs2);
/*********** the ALU *************************************************/
wire [31:0] E_aluIn1 = E_rs1;
// Second operand: rs2 for ALUreg and Branch, the I-type immediate otherwise.
wire [31:0] E_aluIn2 =
   (isBranch(DE_instr) | isALUreg(DE_instr)) ? E_rs2 : Iimm(DE_instr);
// Note: E_shamt is not referenced by the shifter of this module, which
// takes its shift amount from E_aluIn2[4:0].
wire [4:0] E_shamt = isALUreg(DE_instr) ? E_rs2[4:0] : shamt(DE_instr);
wire E_minus = isALUreg(DE_instr) & DE_instr[30];
wire E_arith_shift = DE_instr[30];
// The adder is used by both arithmetic instructions and JALR.
wire [31:0] E_aluPlus = E_aluIn1 + E_aluIn2;
// One 33-bit subtraction (In1 - In2) serves SUB and all comparisons
// (trick borrowed from swapforth/J1): bit 32 is the unsigned "less than".
wire [32:0] E_aluMinus = {1'b0,E_aluIn1} + {1'b1, ~E_aluIn2} + 33'b1;
wire E_LTU = E_aluMinus[32];
wire E_EQ = (E_aluMinus[31:0] == 0);
// Signed compare: when the signs differ, In1 is smaller iff it is negative.
wire E_LT =
   (E_aluIn1[31] ^ E_aluIn2[31]) ? E_aluIn1[31] : E_aluMinus[32];
// Reverse the bit order of a 32-bit word (bit 0 <-> bit 31, ...).
// Used by the shifter: a single right shifter handles left shifts too,
// which saves silicon.
function [31:0] flip32;
   input [31:0] x;
   integer k;
   begin
      for(k = 0; k < 32; k = k + 1) begin
         flip32[k] = x[31-k];
      end
   end
endfunction
// Shifter: left shifts (funct3 == 001) are done by bit-reversing the
// input, shifting right, and bit-reversing the result.
wire [31:0] E_shifter_in =
   (funct3(DE_instr) != 3'b001) ? E_aluIn1 : flip32(E_aluIn1);
// 33-bit signed right shift; the extra top bit is the sign for
// arithmetic shifts and 0 otherwise. The result is truncated to 32 bits.
/* verilator lint_off WIDTH */
wire [31:0] E_shifter =
   $signed({E_arith_shift & E_aluIn1[31], E_shifter_in}) >>> E_aluIn2[4:0];
/* verilator lint_on WIDTH */
wire [31:0] E_leftshift = flip32(E_shifter);
// ALU result, selected by funct3.
reg [31:0] E_aluOut;
always @(*) begin
   case(funct3(DE_instr))
      // add / sub
      3'b000: E_aluOut = E_minus ? E_aluMinus[31:0] : E_aluPlus;
      // shifts
      3'b001: E_aluOut = E_leftshift;
      3'b101: E_aluOut = E_shifter;
      // set-less-than, signed and unsigned
      3'b010: E_aluOut = {31'b0, E_LT};
      3'b011: E_aluOut = {31'b0, E_LTU};
      // bitwise operations
      3'b100: E_aluOut = E_aluIn1 ^ E_aluIn2;
      3'b110: E_aluOut = E_aluIn1 | E_aluIn2;
      3'b111: E_aluOut = E_aluIn1 & E_aluIn2;
   endcase
end
/*********** Branch, JAL, JALR ***********************************/
// Actual branch outcome, from funct3 and the comparator outputs.
// funct3 values 010 and 011 fall into the default case: never taken.
reg E_takeBranch;
always @(*) begin
   case (funct3(DE_instr))
      // equal / not equal
      3'b000:  E_takeBranch =  E_EQ;
      3'b001:  E_takeBranch = !E_EQ;
      // signed less-than / greater-or-equal
      3'b100:  E_takeBranch =  E_LT;
      3'b101:  E_takeBranch = !E_LT;
      // unsigned less-than / greater-or-equal
      3'b110:  E_takeBranch =  E_LTU;
      3'b111:  E_takeBranch = !E_LTU;
      default: E_takeBranch = 1'b0;
   endcase
end
// Jump if mispredicted branch or JALR
`ifdef BENCH
// Simulation-only statistics, incremented by the BENCH always-block that
// follows the E-stage register block.
integer nbBranch = 0;    // branches seen in E
integer nbBranchHit = 0; // ... whose direction was predicted correctly
integer nbJAL = 0;       // JAL instructions seen in E
integer nbJALR = 0;      // JALR instructions seen in E
integer nbJALRhit = 0;   // ... whose target was predicted correctly
`endif
// 2-bit saturating counter update.
//   prev : current counter value
//   dir  : 1 = count up (stops at 2'b11), 0 = count down (stops at 2'b00)
// Returns the updated counter value.
// This replaces a 9-arm conditional chain whose first arm was duplicated
// (dead code), and drops the stray ';' that followed endfunction.
function [1:0] incdec_sat;
   input [1:0] prev;
   input dir;
   begin
      // incdec_sat = dir ? 2'b11 : 2'b00; // simple binary instead of bimodal
      if(dir) begin
         incdec_sat = (prev == 2'b11) ? 2'b11 : (prev + 2'b01);
      end else begin
         incdec_sat = (prev == 2'b00) ? 2'b00 : (prev - 2'b01);
      end
   end
endfunction
// Real JALR target: rs1 + Iimm with bit 0 cleared.
wire [31:0] E_JALRaddr = {E_aluPlus[31:1],1'b0};
// The E stage requests a redirect only on a misprediction:
// a branch whose outcome differs from the predicted direction, or a
// JALR whose target differs from the address guessed in D.
wire E_JumpOrBranch = (
   (isBranch(DE_instr) && (E_takeBranch != DE_predictBranch)) ||
   (isJALR(DE_instr) && (E_JALRaddr != DE_predictRA))
);
// Correction address: for a branch, the path that was NOT followed
// (fall-through if predicted taken, target if predicted not taken);
// otherwise the real JALR target.
wire [31:0] E_JumpOrBranchAddr =
   !isBranch(DE_instr) ? /* JALR */ E_JALRaddr :
   DE_predictBranch    ? (DE_PC + 4) :
                         (DE_PC + Bimm(DE_instr));
// Value computed in E for rd.
wire [31:0] E_result =
   isAUIPC(DE_instr)                    ? DE_PC + Uimm(DE_instr) :
   isLUI(DE_instr)                      ? Uimm(DE_instr) :
   (isJALR(DE_instr) | isJAL(DE_instr)) ? DE_PC+4 :
                                          E_aluOut ;
/**************************************************************/
// E -> M pipeline registers and branch predictor update.
always @(posedge clk) begin
   //if(isJALR(DE_instr)) begin
   // $display("JALR predict %0h effective %0h", DE_predictRA, E_JALRaddr);
   //end
   EM_PC <= DE_PC;
   EM_instr <= DE_instr;
   EM_Eresult <= E_result;
   EM_rs2 <= E_rs2;
   // Load/store address: rs1 + S-type immediate for stores,
   // rs1 + I-type immediate otherwise.
   EM_addr <= E_rs1 + (isStore(DE_instr) ? Simm(DE_instr) : Iimm(DE_instr));
   // Misprediction redirect, registered here and consumed by the F_PC mux.
   EM_JumpOrBranchAddr <= E_JumpOrBranchAddr;
   EM_JumpOrBranchNow <= E_JumpOrBranch;
   // Train the predictor with the outcome of each executed branch,
   // at the indices that were computed when the branch was in D.
   if(isBranch(DE_instr)) begin
      BHT[DE_BHTindex] <= incdec_sat(BHT[DE_BHTindex], E_takeBranch);
      PHT[DE_PHTindex] <= { PHT[DE_PHTindex][BP_HISTO_BITS-2:0],
                            E_takeBranch };
   end
end
`ifdef BENCH
// Simulation-only statistics on the instruction in E (not counted in reset).
always @(posedge clk) begin
   if(resetn && isBranch(DE_instr)) begin
      nbBranch <= nbBranch + 1;
      if(E_takeBranch == DE_predictBranch) begin
         nbBranchHit <= nbBranchHit + 1;
      end
   end
   if(resetn && isJAL(DE_instr)) begin
      nbJAL <= nbJAL + 1;
   end
   if(resetn && isJALR(DE_instr)) begin
      nbJALR <= nbJALR + 1;
      if(DE_predictRA == E_JALRaddr) begin
         nbJALRhit <= nbJALRhit + 1;
      end
   end
end
`endif
// Halt as soon as an EBREAK reaches E (never during reset).
assign halt = isEBREAK(DE_instr) & resetn;
/******************************************************************************/
// E -> M pipeline registers
reg [31:0] EM_PC;
reg [31:0] EM_instr;
reg [31:0] EM_rs2;              // store data
reg [31:0] EM_Eresult;          // result computed in E
reg [31:0] EM_addr;             // load/store address
reg        EM_JumpOrBranchNow;  // registered misprediction redirect
reg [31:0] EM_JumpOrBranchAddr; // ... and its target
/******************************************************************************/
/*** M: Memory ***/
// Access width from funct3[1:0]: 00 = byte, 01 = halfword, else word.
wire [2:0] M_funct3 = funct3(EM_instr);
wire M_isB = (M_funct3[1:0] == 2'b00);
wire M_isH = (M_funct3[1:0] == 2'b01);
/*************** STORE **************************/
// Store data with the source byte/halfword replicated into the byte
// lane(s) selected by the low address bits.
wire [31:0] M_STORE_data = {
   (EM_addr[0] ? EM_rs2[7:0] :
    EM_addr[1] ? EM_rs2[15:8] : EM_rs2[31:24]), // bits 31:24
   (EM_addr[1] ? EM_rs2[7:0] : EM_rs2[23:16]),  // bits 23:16
   (EM_addr[0] ? EM_rs2[7:0] : EM_rs2[15: 8]),  // bits 15:8
   EM_rs2[7:0]                                  // bits  7:0
};
// The memory write mask:
//   1111 if writing a word
//   0011 or 1100 if writing a halfword (depending on EM_addr[1])
//   0001, 0010, 0100 or 1000 if writing a byte (depending on EM_addr[1:0])
wire [3:0] M_STORE_wmask =
   M_isH ? (EM_addr[1] ? 4'b1100 : 4'b0011) :
   M_isB ? (EM_addr[1] ?
               (EM_addr[0] ? 4'b1000 : 4'b0100) :
               (EM_addr[0] ? 4'b0010 : 4'b0001)
           ) :
           4'b1111 ;
// Address bit 22 selects the IO page; everything else goes to data RAM.
wire M_isIO = EM_addr[22];
wire M_isRAM = !M_isIO;
assign IO_mem_addr = EM_addr;
assign IO_mem_wdata = EM_rs2;
assign IO_mem_wr = M_isIO && isStore(EM_instr); // && M_STORE_wmask[0];
// RAM byte-write enables: all zero unless a store targets the RAM.
wire [3:0] M_wmask = M_STORE_wmask & {4{M_isRAM & isStore(EM_instr)}};
reg [31:0] DATARAM [0:16383]; // 16384 4-byte words
// = 64 KB of data RAM in total
// Word index into DATARAM (address bits 15:2).
wire [13:0] M_word_addr = EM_addr[15:2];
// Synchronous read on every clock, plus byte-wise write under M_wmask.
// The read result goes straight into MW_Mdata and nothing else is done
// with it in this block.
always @(posedge clk) begin
MW_Mdata <= DATARAM[M_word_addr];
if(M_wmask[0]) DATARAM[M_word_addr][ 7:0 ] <= M_STORE_data[ 7:0 ];
if(M_wmask[1]) DATARAM[M_word_addr][15:8 ] <= M_STORE_data[15:8 ];
if(M_wmask[2]) DATARAM[M_word_addr][23:16] <= M_STORE_data[23:16];
if(M_wmask[3]) DATARAM[M_word_addr][31:24] <= M_STORE_data[31:24];
end
// Initial RAM content is loaded from a hex file.
initial begin
$readmemh("DATARAM.hex",DATARAM);
end
// M -> W pipeline registers, CSR read and retired-instruction counter.
always @(posedge clk) begin
   MW_PC <= EM_PC;
   MW_instr <= EM_instr;
   MW_Eresult <= EM_Eresult;
   MW_IOresult <= IO_mem_rdata;
   MW_addr <= EM_addr;
   // CSR read, selected by instruction bits {27,21} (see csrId).
   // Nonblocking assignments are required here: MW_CSRresult feeds
   // wbData, which other clocked blocks sample on the same clock edge
   // (register file write, E-stage forwarding). With a blocking
   // assignment those blocks could see either the old or the new value
   // depending on the simulator's process ordering.
   case(csrId(EM_instr))
      2'b00: MW_CSRresult <= cycle[31:0];
      2'b10: MW_CSRresult <= cycle[63:32];
      2'b01: MW_CSRresult <= instret[31:0];
      2'b11: MW_CSRresult <= instret[63:32];
   endcase
   // Count every non-bubble instruction present in the MW register.
   if(!resetn) begin
      instret <= 0;
   end else if(MW_instr != NOP) begin
      instret <= instret + 1;
   end
end
/******************************************************************************/
// M -> W pipeline registers
reg [31:0] MW_PC;
reg [31:0] MW_instr;
reg [31:0] MW_Eresult;   // result computed in E
reg [31:0] MW_addr;      // load address (low bits select byte/halfword)
reg [31:0] MW_Mdata;     // word read from data RAM
reg [31:0] MW_IOresult;  // word read from the IO page
reg [31:0] MW_CSRresult; // CSR value
/******************************************************************************/
/*** W: WriteBack ***/
wire [2:0] W_funct3 = funct3(MW_instr);
wire W_isB = (W_funct3[1:0] == 2'b00);
wire W_isH = (W_funct3[1:0] == 2'b01);
wire W_sext = !W_funct3[2]; // funct3[2] set: zero-extend instead of sign-extend
wire W_isIO = MW_addr[22];
/*************** LOAD ****************************/
// Select the addressed halfword, then the addressed byte within it.
wire [15:0] W_LOAD_H = MW_addr[1] ? MW_Mdata[31:16] : MW_Mdata[15:0];
wire [7:0]  W_LOAD_B = MW_addr[0] ? W_LOAD_H[15:8]  : W_LOAD_H[7:0];
// Extension bit: top bit of the loaded item if sign-extending, else 0.
wire W_LOAD_sign = W_sext & (W_isB ? W_LOAD_B[7] : W_LOAD_H[15]);
wire [31:0] W_Mresult =
   W_isH ? {{16{W_LOAD_sign}},W_LOAD_H} :
   W_isB ? {{24{W_LOAD_sign}},W_LOAD_B} :
           MW_Mdata ;
// Value written back to rd: load data (IO or RAM), CSR value, or E result.
assign wbData =
   isLoad(MW_instr)  ? (W_isIO ? MW_IOresult : W_Mresult) :
   isCSRRS(MW_instr) ? MW_CSRresult :
                       MW_Eresult;
assign wbRdId = rdId(MW_instr);
assign wbEnable = (rdId(MW_instr) != 0) && writesRd(MW_instr);
/******************************************************************************/
// Load-use style hazard: the instruction in D reads a register that a
// Load or CSRRS currently in E will write. Its value is not available
// for forwarding yet, so F and D stall and a bubble is sent to E.
// rdId(DE_instr) != 0 is not tested because in general one
// does not Load to zero ! (idem for CSRRS).
wire rs1Hazard = (rdId(DE_instr) == rs1Id(FD_instr)) && readsRs1(FD_instr);
wire rs2Hazard = (rdId(DE_instr) == rs2Id(FD_instr)) && readsRs2(FD_instr);
wire dataHazard = !FD_nop &&
                  (rs1Hazard || rs2Hazard) &&
                  (isCSRRS(DE_instr) || isLoad(DE_instr));
// Stalls: data hazard or halt. Flushes: misprediction detected in E
// (both D and E), and the hazard bubble (E only).
assign F_stall = halt | dataHazard;
assign D_stall = halt | dataHazard;
assign D_flush = E_JumpOrBranch;
assign E_flush = dataHazard | E_JumpOrBranch;
/******************************************************************************/
`ifdef BENCH
// Simulation only: when the processor halts, print prediction hit rates,
// CPI and instruction mix, then end the simulation.
// NOTE(review): the ratios divide by nbBranch, nbJALR and instret without
// checking for zero; confirm that test programs always execute at least
// one branch and one JALR.
/* verilator lint_off WIDTH */
always @(posedge clk) begin
if(halt) begin
$display("Simulated processor's report");
$display("----------------------------");
// Share of branches whose direction was predicted correctly
$display("Branch hit = %3.3f\%%",
nbBranchHit*100.0/nbBranch );
// Share of JALRs whose target matched the address guessed in D
$display("JALR hit = %3.3f\%%",
nbJALRhit*100.0/nbJALR );
// Cycles per counted instruction
$display("CPI = %3.3f",(cycle*1.0)/(instret*1.0));
$display("Instr. mix = (Branch:%3.3f\%% JAL:%3.3f\%% JALR:%3.3f\%%)",
nbBranch*100.0/instret,
nbJAL*100.0/instret,
nbJALR*100.0/instret);
$finish();
end
end
/* verilator lint_on WIDTH */
`endif
`ifdef VERBOSE
// Debug trace: every cycle (outside reset and halt), print the state of
// the five stages, from W back to F, one line per stage.
always @(posedge clk) begin
if(resetn & !halt) begin
// Redirect and flush signals of this cycle
$write("D_JoB=%d E_JoB=%d D_flush=%d E_flush=%d\n",
D_JumpOrBranchNow, EM_JumpOrBranchNow, D_flush, E_flush
);
// W stage: instruction and, if any, the register write
$write("[W] PC=%h ", MW_PC);
$write(" ");
riscv_disasm(MW_instr,MW_PC);
if(wbEnable) $write(" x%0d <- 0x%0h",rdId(MW_instr),wbData);
$write("\n");
// M stage
$write("[M] PC=%h ", EM_PC);
$write(" ");
riscv_disasm(EM_instr,EM_PC);
$write("\n");
// E stage: register file values (before forwarding) and, for
// branches, the outcome and whether the prediction was right
$write("[E] PC=%h ", DE_PC);
$write(" ");
riscv_disasm(DE_instr,DE_PC);
if(DE_instr != NOP) begin
$write(" rs1=0x%h rs2=0x%h ",DE_rs1, DE_rs2);
if(isBranch(DE_instr)) begin
$write(" taken:%0d prediction OK:%0d",
E_takeBranch,
(E_takeBranch == DE_predictBranch) ? 1 : 0
);
end
end
$write("\n");
// D stage: a '*' marks each source register involved in a data hazard
$write("[D] PC=%h ", FD_PC);
$write("[%s%s] ",
dataHazard && rs1Hazard?"*":" ",
dataHazard && rs2Hazard?"*":" ");
riscv_disasm(FD_nop ? NOP : FD_instr,FD_PC);
if(isBranch(FD_instr)) begin
$write(" predict taken:%0d",D_predictBranch);
end
$write("\n");
// F stage: fetch address and the source of a redirect, if any
$write("[F] PC=%h ", F_PC);
if(D_JumpOrBranchNow) $write(" PC <- [D] 0x%0h",D_JumpOrBranchAddr);
if(EM_JumpOrBranchNow) $write(" PC <- [E] 0x%0h",EM_JumpOrBranchAddr);
$write("\n");
$display("");
end
end
`endif
/******************************************************************************/
endmodule
// System-on-chip wrapper: instantiates the processor, decodes its IO bus
// into LEDs and a transmit-only UART, and generates clock and reset.
module SOC (
input CLK, // system clock
input RESET,// reset button
output reg [4:0] LEDS, // system LEDs
input RXD, // UART receive (not connected inside this module)
output TXD // UART transmit
);
// Internal clock and active-low reset, both produced by Clockworks below.
wire clk;
wire resetn;
// Processor IO bus
wire [31:0] IO_mem_addr;
wire [31:0] IO_mem_rdata;
wire [31:0] IO_mem_wdata;
wire IO_mem_wr;
Processor CPU(
.clk(clk),
.resetn(resetn),
.IO_mem_addr(IO_mem_addr),
.IO_mem_rdata(IO_mem_rdata),
.IO_mem_wdata(IO_mem_wdata),
.IO_mem_wr(IO_mem_wr)
);
// Word address within the IO page; each device is selected by one bit.
wire [13:0] IO_wordaddr = IO_mem_addr[15:2];
// Memory-mapped IO in IO page, 1-hot addressing in word address.
localparam IO_LEDS_bit = 0; // W five leds
localparam IO_UART_DAT_bit = 1; // W data to send (8 bits)
localparam IO_UART_CNTL_bit = 2; // R status. bit 9: busy sending
// LED register: written by an IO write with the LEDS address bit set.
// It has no reset value.
always @(posedge clk) begin
if(IO_mem_wr & IO_wordaddr[IO_LEDS_bit]) begin
LEDS <= IO_mem_wdata[4:0];
end
end
// An IO write with the UART data bit set sends the low byte of the data.
wire uart_valid = IO_mem_wr & IO_wordaddr[IO_UART_DAT_bit];
wire uart_ready;
// NOTE(review): `CPU_FREQ is not defined in the visible source; it is
// presumably a frequency in MHz supplied by clockworks.v or the build
// command line - confirm.
corescore_emitter_uart #(
.clk_freq_hz(`CPU_FREQ*1000000),
.baud_rate(1000000)
) UART(
.i_clk(clk),
.i_rst(!resetn),
.i_data(IO_mem_wdata[7:0]),
.i_valid(uart_valid),
.o_ready(uart_ready),
.o_uart_tx(TXD)
);
// IO read data: the UART status word has !uart_ready in bit 9;
// every other IO address reads as zero.
assign IO_mem_rdata =
IO_wordaddr[IO_UART_CNTL_bit] ? { 22'b0, !uart_ready, 9'b0}
: 32'b0;
`ifdef BENCH
// Simulation only: echo every character sent to the UART on the console.
always @(posedge clk) begin
if(uart_valid) begin
`ifdef VERBOSE
$display("UART: %c", IO_mem_wdata[7:0]);
`else
$write("%c", IO_mem_wdata[7:0] );
// presumably flushes standard output so characters appear immediately
// - confirm the descriptor value against the simulator in use
$fflush(32'h8000_0001);
`endif
end
end
`endif
// Gearbox and reset circuitry.
Clockworks CW(
.CLK(CLK),
.RESET(RESET),
.clk(clk),
.resetn(resetn)
);
endmodule