mirror of
https://github.com/antonblanchard/microwatt.git
synced 2026-01-11 23:43:15 +00:00
core: Add a short multiplier
This adds an optional 16 bit x 16 bit signed multiplier and uses it for multiply instructions that return the low 64 bits of the product (mull[dw][o] and mulli, but not maddld) when the operands are both in the range -2^15 .. 2^15 - 1.

The "short" 16-bit multiplier produces its result combinatorially, so a multiply that uses it executes in one cycle. This improves the coremark result by about 4%, since coremark does quite a lot of multiplies and they almost all have operands that fit into 16 bits.

The presence of the short multiplier is controlled by a generic at the execute1, SOC, core and top levels. For now, it defaults to off for all platforms, and can be enabled using the --has_short_mult flag to fusesoc.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
This commit is contained in:
parent
2224b28c2c
commit
734e4c4a52
@ -13,6 +13,7 @@ entity core is
|
||||
EX1_BYPASS : boolean := true;
|
||||
HAS_FPU : boolean := true;
|
||||
HAS_BTC : boolean := true;
|
||||
HAS_SHORT_MULT : boolean := false;
|
||||
ALT_RESET_ADDRESS : std_ulogic_vector(63 downto 0) := (others => '0');
|
||||
LOG_LENGTH : natural := 512;
|
||||
ICACHE_NUM_LINES : natural := 64;
|
||||
@ -340,6 +341,7 @@ begin
|
||||
generic map (
|
||||
EX1_BYPASS => EX1_BYPASS,
|
||||
HAS_FPU => HAS_FPU,
|
||||
HAS_SHORT_MULT => HAS_SHORT_MULT,
|
||||
LOG_LENGTH => LOG_LENGTH
|
||||
)
|
||||
port map (
|
||||
|
||||
@ -14,6 +14,7 @@ entity execute1 is
|
||||
generic (
|
||||
EX1_BYPASS : boolean := true;
|
||||
HAS_FPU : boolean := true;
|
||||
HAS_SHORT_MULT : boolean := false;
|
||||
-- Non-zero to enable log data collection
|
||||
LOG_LENGTH : natural := 0
|
||||
);
|
||||
@ -95,6 +96,7 @@ architecture behaviour of execute1 is
|
||||
signal a_in, b_in, c_in : std_ulogic_vector(63 downto 0);
|
||||
signal cr_in : std_ulogic_vector(31 downto 0);
|
||||
signal xerc_in : xer_common_t;
|
||||
signal mshort_p : std_ulogic_vector(31 downto 0) := (others => '0');
|
||||
|
||||
signal valid_in : std_ulogic;
|
||||
signal ctrl: ctrl_t := (others => (others => '0'));
|
||||
@ -230,6 +232,24 @@ architecture behaviour of execute1 is
|
||||
return msr_out;
|
||||
end;
|
||||
|
||||
-- Work out whether a signed value fits into n bits,
|
||||
-- that is, see if it is in the range -2^(n-1) .. 2^(n-1) - 1
|
||||
-- val is treated as a two's-complement number whose sign bit is val(val'left).
-- NOTE(review): the final test indexes x(n - 1), so this assumes val is
-- declared "downto" with val'right = 0 and 1 <= n <= val'length; the calls
-- visible in this change pass 64-bit operands with n = 16. Confirm for any
-- other caller.
function fits_in_n_bits(val: std_ulogic_vector; n: integer) return boolean is
|
||||
variable x, xp1: std_ulogic_vector(val'left downto val'right);
|
||||
begin
|
||||
-- Normalise so that the sign-extension bits are ones in both cases:
-- negative inputs are used as-is, non-negative inputs are inverted.
x := val;
|
||||
if val(val'left) = '0' then
|
||||
x := not val;
|
||||
end if;
|
||||
-- Increment x with its bit order mirrored, so the carry ripples from the
-- leftmost bit towards the right. The result xp1 has the leading run of
-- ones in x cleared and the first 0 bit after that run set; lower bits
-- are unchanged.
-- NOTE(review): relies on bit_reverse (defined outside this view)
-- reversing the bit order of its argument; confirm.
xp1 := bit_reverse(std_ulogic_vector(unsigned(bit_reverse(x)) + 1));
|
||||
-- Keep only the bits that the increment cleared, i.e. the leading run of ones.
x := x and not xp1;
|
||||
-- For positive inputs, x has ones at the positions
|
||||
-- to the left of the leftmost 1 bit in val.
|
||||
-- For negative inputs, x has ones to the left of
|
||||
-- the leftmost 0 bit in val.
|
||||
-- Bit n-1 is inside that run exactly when bits val'left downto n-1 are all
-- copies of the sign bit, which is the "fits in n bits" condition.
return x(n - 1) = '1';
|
||||
end;
|
||||
|
||||
-- Tell vivado to keep the hierarchy for the random module so that the
|
||||
-- net names in the xdc file match.
|
||||
attribute keep_hierarchy : string;
|
||||
@ -304,6 +324,17 @@ begin
|
||||
p_out => pmu_to_x
|
||||
);
|
||||
|
||||
-- Optional short multiplier: instantiated only when the HAS_SHORT_MULT
-- generic is true. It is fed the low 16 bits of the a_in and b_in operands
-- and drives the 32-bit mshort_p signal.
short_mult_0: if HAS_SHORT_MULT generate
|
||||
begin
|
||||
short_mult: entity work.short_multiply
|
||||
port map (
|
||||
clk => clk,
|
||||
-- Only the low halfword of each operand is connected; the user of
-- mshort_p must separately ensure the full operands fit in 16 bits.
a_in => a_in(15 downto 0),
|
||||
b_in => b_in(15 downto 0),
|
||||
m_out => mshort_p
|
||||
);
|
||||
end generate;
|
||||
|
||||
dbg_msr_out <= ctrl.msr;
|
||||
log_rd_addr <= r.log_addr_spr;
|
||||
|
||||
@ -509,7 +540,11 @@ begin
|
||||
|
||||
case current.sub_select(1 downto 0) is
|
||||
when "00" =>
|
||||
muldiv_result <= multiply_to_x.result(63 downto 0);
|
||||
if HAS_SHORT_MULT and r.mul_in_progress = '0' then
|
||||
muldiv_result <= std_ulogic_vector(resize(signed(mshort_p), 64));
|
||||
else
|
||||
muldiv_result <= multiply_to_x.result(63 downto 0);
|
||||
end if;
|
||||
when "01" =>
|
||||
muldiv_result <= multiply_to_x.result(127 downto 64);
|
||||
when "10" =>
|
||||
@ -1121,10 +1156,20 @@ begin
|
||||
icache_inval <= '1';
|
||||
|
||||
when OP_MUL_L64 | OP_MUL_H64 | OP_MUL_H32 =>
|
||||
v.e.valid := '0';
|
||||
v.mul_in_progress := '1';
|
||||
v.busy := '1';
|
||||
x_to_multiply.valid <= '1';
|
||||
if HAS_SHORT_MULT and e_in.insn_type = OP_MUL_L64 and e_in.insn(26) = '1' and
|
||||
fits_in_n_bits(a_in, 16) and fits_in_n_bits(b_in, 16) then
|
||||
-- Operands fit into 16 bits, so use short multiplier
|
||||
if e_in.oe = '1' then
|
||||
-- Note 16x16 multiply can't overflow, even for mullwo
|
||||
set_ov(v.e, '0', '0');
|
||||
end if;
|
||||
else
|
||||
-- Use standard multiplier
|
||||
v.e.valid := '0';
|
||||
v.mul_in_progress := '1';
|
||||
v.busy := '1';
|
||||
x_to_multiply.valid <= '1';
|
||||
end if;
|
||||
|
||||
when OP_DIV | OP_DIVE | OP_MOD =>
|
||||
v.e.valid := '0';
|
||||
|
||||
@ -16,6 +16,7 @@ entity toplevel is
|
||||
CLK_FREQUENCY : positive := 100000000;
|
||||
HAS_FPU : boolean := true;
|
||||
HAS_BTC : boolean := true;
|
||||
HAS_SHORT_MULT : boolean := false;
|
||||
USE_LITEDRAM : boolean := false;
|
||||
NO_BRAM : boolean := false;
|
||||
DISABLE_FLATTEN_CORE : boolean := false;
|
||||
@ -194,6 +195,7 @@ begin
|
||||
CLK_FREQ => CLK_FREQUENCY,
|
||||
HAS_FPU => HAS_FPU,
|
||||
HAS_BTC => HAS_BTC,
|
||||
HAS_SHORT_MULT => HAS_SHORT_MULT,
|
||||
HAS_DRAM => USE_LITEDRAM,
|
||||
DRAM_SIZE => 256 * 1024 * 1024,
|
||||
DRAM_INIT_SIZE => PAYLOAD_SIZE,
|
||||
|
||||
@ -13,6 +13,7 @@ entity toplevel is
|
||||
CLK_FREQUENCY : positive := 100000000;
|
||||
HAS_FPU : boolean := true;
|
||||
HAS_BTC : boolean := false;
|
||||
HAS_SHORT_MULT: boolean := false;
|
||||
ICACHE_NUM_LINES : natural := 64;
|
||||
LOG_LENGTH : natural := 512;
|
||||
DISABLE_FLATTEN_CORE : boolean := false;
|
||||
@ -74,6 +75,7 @@ begin
|
||||
CLK_FREQ => CLK_FREQUENCY,
|
||||
HAS_FPU => HAS_FPU,
|
||||
HAS_BTC => HAS_BTC,
|
||||
HAS_SHORT_MULT => HAS_SHORT_MULT,
|
||||
ICACHE_NUM_LINES => ICACHE_NUM_LINES,
|
||||
LOG_LENGTH => LOG_LENGTH,
|
||||
DISABLE_FLATTEN_CORE => DISABLE_FLATTEN_CORE,
|
||||
|
||||
@ -16,6 +16,7 @@ entity toplevel is
|
||||
CLK_FREQUENCY : positive := 100000000;
|
||||
HAS_FPU : boolean := true;
|
||||
HAS_BTC : boolean := true;
|
||||
HAS_SHORT_MULT: boolean := false;
|
||||
USE_LITEDRAM : boolean := false;
|
||||
NO_BRAM : boolean := false;
|
||||
DISABLE_FLATTEN_CORE : boolean := false;
|
||||
@ -170,6 +171,7 @@ begin
|
||||
CLK_FREQ => CLK_FREQUENCY,
|
||||
HAS_FPU => HAS_FPU,
|
||||
HAS_BTC => HAS_BTC,
|
||||
HAS_SHORT_MULT=> HAS_SHORT_MULT,
|
||||
HAS_DRAM => USE_LITEDRAM,
|
||||
DRAM_SIZE => 512 * 1024 * 1024,
|
||||
DRAM_INIT_SIZE => PAYLOAD_SIZE,
|
||||
|
||||
@ -138,6 +138,7 @@ targets:
|
||||
- uart_is_16550
|
||||
- has_fpu
|
||||
- has_btc
|
||||
- has_short_mult
|
||||
tools:
|
||||
vivado: {part : xc7a100tcsg324-1}
|
||||
toplevel : toplevel
|
||||
@ -243,6 +244,7 @@ targets:
|
||||
- uart_is_16550
|
||||
- has_fpu
|
||||
- has_btc
|
||||
- has_short_mult
|
||||
generate: [litedram_nexys_video, liteeth_nexys_video, litesdcard_nexys_video]
|
||||
tools:
|
||||
vivado: {part : xc7a200tsbg484-1}
|
||||
@ -263,6 +265,7 @@ targets:
|
||||
- has_uart1
|
||||
- has_fpu=false
|
||||
- has_btc=false
|
||||
- has_short_mult
|
||||
- use_litesdcard
|
||||
tools:
|
||||
vivado: {part : xc7a35ticsg324-1L}
|
||||
@ -285,6 +288,7 @@ targets:
|
||||
- has_uart1
|
||||
- has_fpu=false
|
||||
- has_btc=false
|
||||
- has_short_mult
|
||||
generate: [litedram_arty, liteeth_arty, litesdcard_arty]
|
||||
tools:
|
||||
vivado: {part : xc7a35ticsg324-1L}
|
||||
@ -305,6 +309,7 @@ targets:
|
||||
- has_uart1
|
||||
- has_fpu
|
||||
- has_btc
|
||||
- has_short_mult
|
||||
- use_litesdcard
|
||||
tools:
|
||||
vivado: {part : xc7a100ticsg324-1L}
|
||||
@ -327,6 +332,7 @@ targets:
|
||||
- has_uart1
|
||||
- has_fpu
|
||||
- has_btc
|
||||
- has_short_mult
|
||||
generate: [litedram_arty, liteeth_arty, litesdcard_arty]
|
||||
tools:
|
||||
vivado: {part : xc7a100ticsg324-1L}
|
||||
@ -430,6 +436,12 @@ parameters:
|
||||
paramtype : generic
|
||||
default : true
|
||||
|
||||
has_short_mult:
|
||||
datatype : bool
|
||||
description : Include a 16 bit x 16 bit single-cycle multiplier in the core
|
||||
paramtype : generic
|
||||
default : false
|
||||
|
||||
disable_flatten_core:
|
||||
datatype : bool
|
||||
description : Prevent Vivado from flattening the main core components
|
||||
|
||||
@ -86,3 +86,22 @@ begin
|
||||
rin <= v;
|
||||
end process;
|
||||
end architecture behaviour;
|
||||
|
||||
library ieee;
|
||||
use ieee.std_logic_1164.all;
|
||||
use ieee.numeric_std.all;
|
||||
|
||||
-- Generic (vendor-independent) 16 x 16 signed multiplier.
-- a_in and b_in are interpreted as signed 16-bit values; m_out is their
-- 32-bit signed product.
entity short_multiply is
|
||||
port (
|
||||
-- clk is part of the port list but is not referenced by the
-- architecture below.
clk : in std_ulogic;
|
||||
|
||||
a_in : in std_ulogic_vector(15 downto 0);
|
||||
b_in : in std_ulogic_vector(15 downto 0);
|
||||
m_out : out std_ulogic_vector(31 downto 0)
|
||||
);
|
||||
end entity short_multiply;
|
||||
|
||||
architecture behaviour of short_multiply is
|
||||
begin
|
||||
-- Concurrent (unclocked) assignment: the numeric_std signed "*" of two
-- 16-bit operands yields a 32-bit result, matching the width of m_out.
m_out <= std_ulogic_vector(signed(a_in) * signed(b_in));
|
||||
end architecture behaviour;
|
||||
|
||||
2
soc.vhdl
2
soc.vhdl
@ -59,6 +59,7 @@ entity soc is
|
||||
SIM : boolean;
|
||||
HAS_FPU : boolean := true;
|
||||
HAS_BTC : boolean := true;
|
||||
HAS_SHORT_MULT : boolean := false;
|
||||
DISABLE_FLATTEN_CORE : boolean := false;
|
||||
HAS_DRAM : boolean := false;
|
||||
DRAM_SIZE : integer := 0;
|
||||
@ -325,6 +326,7 @@ begin
|
||||
SIM => SIM,
|
||||
HAS_FPU => HAS_FPU,
|
||||
HAS_BTC => HAS_BTC,
|
||||
HAS_SHORT_MULT => HAS_SHORT_MULT,
|
||||
DISABLE_FLATTEN => DISABLE_FLATTEN_CORE,
|
||||
ALT_RESET_ADDRESS => (23 downto 0 => '0', others => '1'),
|
||||
LOG_LENGTH => LOG_LENGTH,
|
||||
|
||||
@ -992,3 +992,84 @@ begin
|
||||
end process;
|
||||
|
||||
end architecture behaviour;
|
||||
|
||||
library ieee;
|
||||
use ieee.std_logic_1164.all;
|
||||
use ieee.numeric_std.all;
|
||||
|
||||
library unisim;
|
||||
use unisim.vcomponents.all;
|
||||
|
||||
-- Xilinx-specific 16 x 16 signed multiplier built on a DSP48E1 primitive.
-- Same port list as the generic short_multiply: a_in and b_in are signed
-- 16-bit operands, m_out is the low 32 bits of the DSP output.
entity short_multiply is
|
||||
port (
|
||||
clk : in std_logic;
|
||||
|
||||
a_in : in std_ulogic_vector(15 downto 0);
|
||||
b_in : in std_ulogic_vector(15 downto 0);
|
||||
m_out : out std_ulogic_vector(31 downto 0)
|
||||
);
|
||||
end entity short_multiply;
|
||||
|
||||
architecture behaviour of short_multiply is
|
||||
-- Full 48-bit P output of the DSP slice; only bits 31..0 are used.
signal mshort_p : std_ulogic_vector(47 downto 0);
|
||||
begin
|
||||
mshort: DSP48E1
|
||||
-- Every *REG generic is set to 0 and every clock-enable port below is
-- tied to '0'.
-- NOTE(review): per the Xilinx DSP48E1 documentation this should disable
-- all internal pipeline registers so the product appears combinationally;
-- confirm against UG479.
generic map (
|
||||
ACASCREG => 0,
|
||||
ALUMODEREG => 0,
|
||||
AREG => 0,
|
||||
BCASCREG => 0,
|
||||
BREG => 0,
|
||||
CARRYINREG => 0,
|
||||
CARRYINSELREG => 0,
|
||||
CREG => 0,
|
||||
INMODEREG => 0,
|
||||
MREG => 0,
|
||||
OPMODEREG => 0,
|
||||
PREG => 0
|
||||
)
|
||||
port map (
|
||||
-- Sign-extend the 16-bit operand to the 30-bit A port.
A => std_ulogic_vector(resize(signed(a_in(15 downto 0)), 30)),
|
||||
ACIN => (others => '0'),
|
||||
ALUMODE => "0000",
|
||||
-- Sign-extend the 16-bit operand to the 18-bit B port.
B => std_ulogic_vector(resize(signed(b_in(15 downto 0)), 18)),
|
||||
BCIN => (others => '0'),
|
||||
-- C input is tied to zero.
C => 48x"0",
|
||||
CARRYCASCIN => '0',
|
||||
CARRYIN => '0',
|
||||
CARRYINSEL => "000",
|
||||
CEA1 => '0',
|
||||
CEA2 => '0',
|
||||
CEAD => '0',
|
||||
CEALUMODE => '0',
|
||||
CEB1 => '0',
|
||||
CEB2 => '0',
|
||||
CEC => '0',
|
||||
CECARRYIN => '0',
|
||||
CECTRL => '0',
|
||||
CED => '0',
|
||||
CEINMODE => '0',
|
||||
CEM => '0',
|
||||
CEP => '0',
|
||||
CLK => clk,
|
||||
D => (others => '0'),
|
||||
INMODE => "00000",
|
||||
MULTSIGNIN => '0',
|
||||
-- NOTE(review): OPMODE "0110101" together with ALUMODE "0000" is
-- presumably "multiplier output plus C" (C being zero here); verify the
-- field encoding against UG479.
OPMODE => "0110101",
|
||||
P => mshort_p,
|
||||
PCIN => (others => '0'),
|
||||
RSTA => '0',
|
||||
RSTALLCARRYIN => '0',
|
||||
RSTALUMODE => '0',
|
||||
RSTB => '0',
|
||||
RSTC => '0',
|
||||
RSTCTRL => '0',
|
||||
RSTD => '0',
|
||||
RSTINMODE => '0',
|
||||
RSTM => '0',
|
||||
RSTP => '0'
|
||||
);
|
||||
|
||||
-- Return only the low 32 bits of the 48-bit DSP result.
m_out <= mshort_p(31 downto 0);
|
||||
|
||||
end architecture behaviour;
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user