mirror of
https://github.com/antonblanchard/microwatt.git
synced 2026-05-02 22:43:29 +00:00
execute1: Take an extra cycle for OE=1 multiply instructions
We now expect the overflow signal from the multiplier to come along one cycle later than the product. This breaks up a long combinatorial path and improves timing.

This also changes some uses of v.<field> to r.<field> in the slow op logic, which should help timing as well.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
This commit is contained in:
@@ -56,6 +56,7 @@ architecture behaviour of execute1 is
|
|||||||
lr_update : std_ulogic;
|
lr_update : std_ulogic;
|
||||||
next_lr : std_ulogic_vector(63 downto 0);
|
next_lr : std_ulogic_vector(63 downto 0);
|
||||||
mul_in_progress : std_ulogic;
|
mul_in_progress : std_ulogic;
|
||||||
|
mul_finish : std_ulogic;
|
||||||
div_in_progress : std_ulogic;
|
div_in_progress : std_ulogic;
|
||||||
cntz_in_progress : std_ulogic;
|
cntz_in_progress : std_ulogic;
|
||||||
slow_op_insn : insn_type_t;
|
slow_op_insn : insn_type_t;
|
||||||
@@ -69,7 +70,7 @@ architecture behaviour of execute1 is
|
|||||||
constant reg_type_init : reg_type :=
|
constant reg_type_init : reg_type :=
|
||||||
(e => Execute1ToWritebackInit, f => Execute1ToFetch1Init,
|
(e => Execute1ToWritebackInit, f => Execute1ToFetch1Init,
|
||||||
busy => '0', lr_update => '0', terminate => '0',
|
busy => '0', lr_update => '0', terminate => '0',
|
||||||
mul_in_progress => '0', div_in_progress => '0', cntz_in_progress => '0',
|
mul_in_progress => '0', mul_finish => '0', div_in_progress => '0', cntz_in_progress => '0',
|
||||||
slow_op_insn => OP_ILLEGAL, slow_op_rc => '0', slow_op_oe => '0', slow_op_xerc => xerc_init,
|
slow_op_insn => OP_ILLEGAL, slow_op_rc => '0', slow_op_oe => '0', slow_op_xerc => xerc_init,
|
||||||
next_lr => (others => '0'), last_nia => (others => '0'), others => (others => '0'));
|
next_lr => (others => '0'), last_nia => (others => '0'), others => (others => '0'));
|
||||||
|
|
||||||
@@ -371,6 +372,7 @@ begin
|
|||||||
v.mul_in_progress := '0';
|
v.mul_in_progress := '0';
|
||||||
v.div_in_progress := '0';
|
v.div_in_progress := '0';
|
||||||
v.cntz_in_progress := '0';
|
v.cntz_in_progress := '0';
|
||||||
|
v.mul_finish := '0';
|
||||||
|
|
||||||
-- signals to multiply and divide units
|
-- signals to multiply and divide units
|
||||||
sign1 := '0';
|
sign1 := '0';
|
||||||
@@ -965,31 +967,47 @@ begin
|
|||||||
when others =>
|
when others =>
|
||||||
-- i.e. OP_MUL_L64
|
-- i.e. OP_MUL_L64
|
||||||
result := multiply_to_x.result(63 downto 0);
|
result := multiply_to_x.result(63 downto 0);
|
||||||
overflow := multiply_to_x.overflow;
|
|
||||||
end case;
|
end case;
|
||||||
else
|
else
|
||||||
result := divider_to_x.write_reg_data;
|
result := divider_to_x.write_reg_data;
|
||||||
overflow := divider_to_x.overflow;
|
overflow := divider_to_x.overflow;
|
||||||
end if;
|
end if;
|
||||||
result_en := '1';
|
if r.mul_in_progress = '1' and r.slow_op_oe = '1' then
|
||||||
v.e.write_reg := gpr_to_gspr(v.slow_op_dest);
|
-- have to wait until next cycle for overflow indication
|
||||||
v.e.rc := v.slow_op_rc;
|
v.mul_finish := '1';
|
||||||
v.e.xerc := v.slow_op_xerc;
|
v.busy := '1';
|
||||||
v.e.write_xerc_enable := v.slow_op_oe;
|
else
|
||||||
-- We must test oe because the RC update code in writeback
|
result_en := '1';
|
||||||
-- will use the xerc value to set CR0:SO so we must not clobber
|
v.e.write_reg := gpr_to_gspr(r.slow_op_dest);
|
||||||
-- xerc if OE wasn't set.
|
v.e.rc := r.slow_op_rc;
|
||||||
if v.slow_op_oe = '1' then
|
v.e.xerc := r.slow_op_xerc;
|
||||||
v.e.xerc.ov := overflow;
|
v.e.write_xerc_enable := r.slow_op_oe;
|
||||||
v.e.xerc.ov32 := overflow;
|
-- We must test oe because the RC update code in writeback
|
||||||
v.e.xerc.so := v.slow_op_xerc.so or overflow;
|
-- will use the xerc value to set CR0:SO so we must not clobber
|
||||||
end if;
|
-- xerc if OE wasn't set.
|
||||||
v.e.valid := '1';
|
if r.slow_op_oe = '1' then
|
||||||
|
v.e.xerc.ov := overflow;
|
||||||
|
v.e.xerc.ov32 := overflow;
|
||||||
|
v.e.xerc.so := r.slow_op_xerc.so or overflow;
|
||||||
|
end if;
|
||||||
|
v.e.valid := '1';
|
||||||
|
end if;
|
||||||
else
|
else
|
||||||
v.busy := '1';
|
v.busy := '1';
|
||||||
v.mul_in_progress := r.mul_in_progress;
|
v.mul_in_progress := r.mul_in_progress;
|
||||||
v.div_in_progress := r.div_in_progress;
|
v.div_in_progress := r.div_in_progress;
|
||||||
end if;
|
end if;
|
||||||
|
elsif r.mul_finish = '1' then
|
||||||
|
result := r.e.write_data;
|
||||||
|
result_en := '1';
|
||||||
|
v.e.write_reg := gpr_to_gspr(r.slow_op_dest);
|
||||||
|
v.e.rc := r.slow_op_rc;
|
||||||
|
v.e.xerc := r.slow_op_xerc;
|
||||||
|
v.e.write_xerc_enable := r.slow_op_oe;
|
||||||
|
v.e.xerc.ov := multiply_to_x.overflow;
|
||||||
|
v.e.xerc.ov32 := multiply_to_x.overflow;
|
||||||
|
v.e.xerc.so := r.slow_op_xerc.so or multiply_to_x.overflow;
|
||||||
|
v.e.valid := '1';
|
||||||
end if;
|
end if;
|
||||||
|
|
||||||
if illegal = '1' then
|
if illegal = '1' then
|
||||||
|
|||||||
@@ -38,12 +38,15 @@ architecture behaviour of multiply is
|
|||||||
end record;
|
end record;
|
||||||
|
|
||||||
signal r, rin : reg_type := (multiply_pipeline => MultiplyPipelineInit);
|
signal r, rin : reg_type := (multiply_pipeline => MultiplyPipelineInit);
|
||||||
|
signal overflow : std_ulogic;
|
||||||
|
signal ovf_in : std_ulogic;
|
||||||
begin
|
begin
|
||||||
multiply_0: process(clk)
|
multiply_0: process(clk)
|
||||||
begin
|
begin
|
||||||
if rising_edge(clk) then
|
if rising_edge(clk) then
|
||||||
m <= m_in;
|
m <= m_in;
|
||||||
r <= rin;
|
r <= rin;
|
||||||
|
overflow <= ovf_in;
|
||||||
end if;
|
end if;
|
||||||
end process;
|
end process;
|
||||||
|
|
||||||
@@ -74,9 +77,10 @@ begin
|
|||||||
else
|
else
|
||||||
ov := (or d(127 downto 63)) and not (and d(127 downto 63));
|
ov := (or d(127 downto 63)) and not (and d(127 downto 63));
|
||||||
end if;
|
end if;
|
||||||
|
ovf_in <= ov;
|
||||||
|
|
||||||
m_out.result <= d;
|
m_out.result <= d;
|
||||||
m_out.overflow <= ov;
|
m_out.overflow <= overflow;
|
||||||
m_out.valid <= v.multiply_pipeline(PIPELINE_DEPTH-1).valid;
|
m_out.valid <= v.multiply_pipeline(PIPELINE_DEPTH-1).valid;
|
||||||
|
|
||||||
rin <= v;
|
rin <= v;
|
||||||
|
|||||||
@@ -35,6 +35,7 @@ architecture behaviour of multiply is
|
|||||||
signal req_32bit, r32_1 : std_ulogic;
|
signal req_32bit, r32_1 : std_ulogic;
|
||||||
signal req_not, rnot_1 : std_ulogic;
|
signal req_not, rnot_1 : std_ulogic;
|
||||||
signal valid_1 : std_ulogic;
|
signal valid_1 : std_ulogic;
|
||||||
|
signal overflow, ovf_in : std_ulogic;
|
||||||
|
|
||||||
begin
|
begin
|
||||||
addend <= m_in.addend;
|
addend <= m_in.addend;
|
||||||
@@ -964,9 +965,10 @@ begin
|
|||||||
ov := not ((p1_pat and p0_pat and not product(31)) or
|
ov := not ((p1_pat and p0_pat and not product(31)) or
|
||||||
(p1_patb and p0_patb and product(31)));
|
(p1_patb and p0_patb and product(31)));
|
||||||
end if;
|
end if;
|
||||||
|
ovf_in <= ov;
|
||||||
|
|
||||||
m_out.result <= product;
|
m_out.result <= product;
|
||||||
m_out.overflow <= ov;
|
m_out.overflow <= overflow;
|
||||||
end process;
|
end process;
|
||||||
|
|
||||||
process(clk)
|
process(clk)
|
||||||
@@ -979,6 +981,7 @@ begin
|
|||||||
r32_1 <= m_in.is_32bit;
|
r32_1 <= m_in.is_32bit;
|
||||||
req_not <= rnot_1;
|
req_not <= rnot_1;
|
||||||
rnot_1 <= m_in.not_result;
|
rnot_1 <= m_in.not_result;
|
||||||
|
overflow <= ovf_in;
|
||||||
end if;
|
end if;
|
||||||
end process;
|
end process;
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user