mirror of
https://github.com/antonblanchard/microwatt.git
synced 2026-03-01 09:30:52 +00:00
core: Change bperm to a simpler and slower implementation
This does bperm in the bitsort unit instead of the logical unit, and no longer tries to do it in a single cycle with eight 64-to-1 multiplexers. Instead it is now a state machine in the bitsort unit, takes 8 cycles, and only has one 64-to-1 multiplexer. This helps improve timing and reduces LUT usage.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
This commit is contained in:
47
bitsort.vhdl
47
bitsort.vhdl
@@ -1,5 +1,6 @@
|
||||
-- Implements instructions that involve sorting bits,
|
||||
-- that is, cfuged, pextd and pdepd.
|
||||
-- Also does bperm, which is somewhat different.
|
||||
--
|
||||
-- cfuged: Sort the bits in the mask in RB into 0s at the left, 1s at the right
|
||||
-- and move the bits in RS in the same fashion to give the result
|
||||
@@ -7,6 +8,7 @@
|
||||
-- corresponding bit in RB is 1
|
||||
-- pdepd: Inverse of pextd; take the low-order bits of RS and spread them out
|
||||
-- to the bit positions which have a 1 in RB
|
||||
-- bperm: Select 8 arbitrary bits
|
||||
|
||||
-- NB opc is bits 7-6 of the instruction:
|
||||
-- 00 = pdepd, 01 = pextd, 10 = cfuged
|
||||
@@ -27,6 +29,8 @@ entity bit_sorter is
|
||||
go : in std_ulogic;
|
||||
opc : in std_ulogic_vector(1 downto 0);
|
||||
done : out std_ulogic;
|
||||
do_bperm : in std_ulogic;
|
||||
bperm_done : out std_ulogic;
|
||||
result : out std_ulogic_vector(63 downto 0)
|
||||
);
|
||||
end entity bit_sorter;
|
||||
@@ -45,6 +49,13 @@ architecture behaviour of bit_sorter is
|
||||
signal sr_vl : std_ulogic_vector(63 downto 0);
|
||||
signal sr_vr : std_ulogic_vector(63 downto 0);
|
||||
|
||||
signal is_bperm : std_ulogic;
|
||||
signal bpc : unsigned(2 downto 0);
|
||||
signal bp_done : std_ulogic;
|
||||
signal bperm_res : std_ulogic_vector(7 downto 0);
|
||||
signal rs_sr : std_ulogic_vector(63 downto 0);
|
||||
signal rb_bp : std_ulogic_vector(63 downto 0);
|
||||
|
||||
begin
|
||||
bsort_r: process(clk)
|
||||
begin
|
||||
@@ -96,7 +107,41 @@ begin
|
||||
end if;
|
||||
end process;
|
||||
|
||||
-- bit permutation (bperm)
--
-- One result bit is produced per step from a single 64-to-1 multiplexer.
-- rs_sr holds the index bytes of RS that have not been consumed yet, with
-- the current index in its low byte, and rb_bp holds the copy of RB that
-- was latched when the operation started.
--
-- The index is complemented before it addresses rb_bp, so index 0 selects
-- rb_bp(63): indices count from the most-significant bit.  Only the low six
-- bits of the index reach the multiplexer, so an index with either of its
-- top two bits set (index >= 64) must be forced to '0' explicitly;
-- otherwise it would alias onto one of the 64 bits of RB.  This is the same
-- range check that the single-cycle implementation made with
-- rs(j+7 downto j+6) = "00".
bperm_res(7) <= 'X' when is_X(rs_sr) else
                '0' when rs_sr(7 downto 6) /= "00" else
                rb_bp(to_integer(unsigned(not rs_sr(5 downto 0))));

-- Sequencer for bperm.
--   rst      : clears the state machine.
--   do_bperm : latches RS and RB and starts a new operation; it is tested
--              before bp_done and is_bperm, so it restarts an operation
--              that is already in flight.
--   bp_done  : set on the clock edge where bpc = 6; the following clock
--              edge clears it together with is_bperm.
-- While is_bperm is set and bp_done is clear, each clock edge shifts the
-- bit just selected down into bperm_res(6 downto 0) and moves rs_sr on to
-- the next index byte.  After seven shifts the first bit selected sits in
-- bperm_res(0) and the multiplexer output for the eighth index byte is on
-- bperm_res(7), so all eight result bits are present while bp_done = '1'.
bperm_r: process(clk)
begin
    if rising_edge(clk) then
        if rst = '1' then
            is_bperm <= '0';
            bp_done <= '0';
            bperm_res(6 downto 0) <= (others => '0');
            bpc <= to_unsigned(0, 3);
        elsif do_bperm = '1' then
            -- start: capture the operands and reset the step counter
            is_bperm <= '1';
            bp_done <= '0';
            bperm_res(6 downto 0) <= (others => '0');
            bpc <= to_unsigned(0, 3);
            rs_sr <= rs;
            rb_bp <= rb;
        elsif bp_done = '1' then
            -- the result was presented during this cycle; go idle
            is_bperm <= '0';
            bp_done <= '0';
        elsif is_bperm = '1' then
            -- keep the bit just selected and step to the next index byte
            bperm_res(6 downto 0) <= bperm_res(7 downto 1);
            rs_sr <= x"00" & rs_sr(63 downto 8);
            if bpc = "110" then
                -- seventh shift: the eighth bit is available combinationally
                bp_done <= '1';
            end if;
            bpc <= bpc + 1;
        end if;
    end if;
end process;
|
||||
|
||||
done <= sd;
|
||||
result <= val;
|
||||
bperm_done <= bp_done;
|
||||
result <= val when is_bperm = '0' else (56x"0" & bperm_res);
|
||||
|
||||
end behaviour;
|
||||
|
||||
@@ -227,7 +227,6 @@ architecture behaviour of decode2 is
|
||||
OP_PRTY => "001",
|
||||
OP_CMPB => "001",
|
||||
OP_EXTS => "001",
|
||||
OP_BPERM => "001",
|
||||
OP_BREV => "001",
|
||||
OP_BCD => "001",
|
||||
OP_MTSPR => "001",
|
||||
@@ -256,6 +255,7 @@ architecture behaviour of decode2 is
|
||||
OP_DIVE => "101",
|
||||
OP_MOD => "101",
|
||||
OP_BSORT => "100",
|
||||
OP_BPERM => "100",
|
||||
OP_ADDG6S => "001", -- misc_result
|
||||
OP_ISEL => "010",
|
||||
OP_DARN => "011",
|
||||
|
||||
@@ -116,6 +116,7 @@ architecture behaviour of execute1 is
|
||||
start_mul : std_ulogic;
|
||||
start_div : std_ulogic;
|
||||
start_bsort : std_ulogic;
|
||||
start_bperm : std_ulogic;
|
||||
do_trace : std_ulogic;
|
||||
ciabr_trace : std_ulogic;
|
||||
fp_intr : std_ulogic;
|
||||
@@ -150,6 +151,7 @@ architecture behaviour of execute1 is
|
||||
mul_finish : std_ulogic;
|
||||
div_in_progress : std_ulogic;
|
||||
bsort_in_progress : std_ulogic;
|
||||
bperm_in_progress : std_ulogic;
|
||||
no_instr_avail : std_ulogic;
|
||||
instr_dispatch : std_ulogic;
|
||||
ext_interrupt : std_ulogic;
|
||||
@@ -174,7 +176,7 @@ architecture behaviour of execute1 is
|
||||
spr_select => spr_id_init, pmu_spr_num => 5x"0",
|
||||
redir_to_next => '0', advance_nia => '0', lr_from_next => '0',
|
||||
mul_in_progress => '0', mul_finish => '0', div_in_progress => '0',
|
||||
bsort_in_progress => '0',
|
||||
bsort_in_progress => '0', bperm_in_progress => '0',
|
||||
no_instr_avail => '0', instr_dispatch => '0', ext_interrupt => '0',
|
||||
taken_branch_event => '0', br_mispredict => '0',
|
||||
msr => 64x"0",
|
||||
@@ -245,6 +247,8 @@ architecture behaviour of execute1 is
|
||||
-- bit-sort unit signals
|
||||
signal bsort_start : std_ulogic;
|
||||
signal bsort_done : std_ulogic;
|
||||
signal bperm_start : std_ulogic;
|
||||
signal bperm_done : std_ulogic;
|
||||
|
||||
-- random number generator signals
|
||||
signal random_raw : std_ulogic_vector(63 downto 0);
|
||||
@@ -515,6 +519,8 @@ begin
|
||||
go => bsort_start,
|
||||
opc => e_in.insn(7 downto 6),
|
||||
done => bsort_done,
|
||||
do_bperm => bperm_start,
|
||||
bperm_done => bperm_done,
|
||||
result => bsort_result
|
||||
);
|
||||
|
||||
@@ -1228,7 +1234,7 @@ begin
|
||||
when OP_CMPRB =>
|
||||
when OP_CMPEQB =>
|
||||
when OP_LOGIC | OP_XOR | OP_PRTY | OP_CMPB | OP_EXTS |
|
||||
OP_BPERM | OP_BREV | OP_BCD =>
|
||||
OP_BREV | OP_BCD =>
|
||||
|
||||
when OP_B =>
|
||||
v.take_branch := '1';
|
||||
@@ -1433,6 +1439,11 @@ begin
|
||||
slow_op := '1';
|
||||
owait := '1';
|
||||
|
||||
when OP_BPERM =>
|
||||
v.start_bperm := '1';
|
||||
slow_op := '1';
|
||||
owait := '1';
|
||||
|
||||
when OP_MUL_L64 =>
|
||||
if e_in.is_32bit = '1' then
|
||||
v.se.mult_32s := '1';
|
||||
@@ -1718,6 +1729,7 @@ begin
|
||||
x_to_divider.valid <= actions.start_div;
|
||||
v.div_in_progress := actions.start_div;
|
||||
v.bsort_in_progress := actions.start_bsort;
|
||||
v.bperm_in_progress := actions.start_bperm;
|
||||
v.br_mispredict := v.e.redirect and actions.direct_branch;
|
||||
v.advance_nia := actions.advance_nia;
|
||||
v.redir_to_next := actions.redir_to_next;
|
||||
@@ -1728,7 +1740,8 @@ begin
|
||||
-- multiply is happening in order to stop following
|
||||
-- instructions from using the wrong XER value
|
||||
-- (and for simplicity in the OE=0 case).
|
||||
v.busy := actions.start_div or actions.start_mul or actions.start_bsort;
|
||||
v.busy := actions.start_div or actions.start_mul or
|
||||
actions.start_bsort or actions.start_bperm;
|
||||
|
||||
-- instruction for other units, i.e. LDST
|
||||
if e_in.unit = LDST then
|
||||
@@ -1740,6 +1753,7 @@ begin
|
||||
end if;
|
||||
is_scv := go and actions.se.scv_trap;
|
||||
bsort_start <= go and actions.start_bsort;
|
||||
bperm_start <= go and actions.start_bperm;
|
||||
pmu_trace <= go and actions.do_trace;
|
||||
|
||||
if not HAS_FPU and ex1.div_in_progress = '1' then
|
||||
@@ -1780,6 +1794,13 @@ begin
|
||||
v.e.write_data := alu_result;
|
||||
bypass_valid := bsort_done;
|
||||
end if;
|
||||
if ex1.bperm_in_progress = '1' then
|
||||
v.bperm_in_progress := not bperm_done;
|
||||
v.e.valid := bperm_done;
|
||||
v.busy := not bperm_done;
|
||||
v.e.write_data := alu_result;
|
||||
bypass_valid := bperm_done;
|
||||
end if;
|
||||
|
||||
if v.e.write_xerc_enable = '1' and v.e.valid = '1' then
|
||||
v.xerc := v.e.xerc;
|
||||
|
||||
13
logical.vhdl
13
logical.vhdl
@@ -23,7 +23,6 @@ architecture behaviour of logical is
|
||||
|
||||
signal par0, par1 : std_ulogic;
|
||||
signal parity : std_ulogic_vector(63 downto 0);
|
||||
signal permute : std_ulogic_vector(7 downto 0);
|
||||
|
||||
function bcd_to_dpd(bcd: std_ulogic_vector(11 downto 0)) return std_ulogic_vector is
|
||||
variable dpd: std_ulogic_vector(9 downto 0);
|
||||
@@ -109,16 +108,6 @@ begin
|
||||
parity(32) <= par1;
|
||||
end if;
|
||||
|
||||
-- bit permutation
|
||||
for i in 0 to 7 loop
|
||||
j := i * 8;
|
||||
if rs(j+7 downto j+6) = "00" then
|
||||
permute(i) <= rb(to_integer(unsigned(not rs(j+5 downto j))));
|
||||
else
|
||||
permute(i) <= '0';
|
||||
end if;
|
||||
end loop;
|
||||
|
||||
rb_adj := rb;
|
||||
if invert_in = '1' then
|
||||
rb_adj := not rb;
|
||||
@@ -157,8 +146,6 @@ begin
|
||||
tmp := parity;
|
||||
when OP_CMPB =>
|
||||
tmp := ppc_cmpb(rs, rb);
|
||||
when OP_BPERM =>
|
||||
tmp := std_ulogic_vector(resize(unsigned(permute), 64));
|
||||
when OP_BCD =>
|
||||
-- invert_in is abused to indicate direction of conversion
|
||||
if invert_in = '0' then
|
||||
|
||||
Reference in New Issue
Block a user