mirror of
https://github.com/antonblanchard/microwatt.git
synced 2026-01-11 23:43:15 +00:00
Merge pull request #345 from antonblanchard/popcnt-go-fast
popcnt* timing improvements from Paul
This commit is contained in:
commit
cef3660e74
2
Makefile
2
Makefile
@ -60,7 +60,7 @@ core_files = decode_types.vhdl common.vhdl wishbone_types.vhdl fetch1.vhdl \
|
||||
decode1.vhdl helpers.vhdl insn_helpers.vhdl \
|
||||
control.vhdl decode2.vhdl register_file.vhdl \
|
||||
cr_file.vhdl crhelpers.vhdl ppc_fx_insns.vhdl rotator.vhdl \
|
||||
logical.vhdl countzero.vhdl multiply.vhdl divider.vhdl execute1.vhdl \
|
||||
logical.vhdl countbits.vhdl multiply.vhdl divider.vhdl execute1.vhdl \
|
||||
loadstore1.vhdl mmu.vhdl dcache.vhdl writeback.vhdl core_debug.vhdl \
|
||||
core.vhdl fpu.vhdl pmu.vhdl
|
||||
|
||||
|
||||
130
countbits.vhdl
Normal file
130
countbits.vhdl
Normal file
@ -0,0 +1,130 @@
|
||||
library ieee;
|
||||
use ieee.std_logic_1164.all;
|
||||
use ieee.numeric_std.all;
|
||||
|
||||
library work;
|
||||
use work.helpers.all;
|
||||
|
||||
-- bit_counter: combined count-zeroes / population-count unit.
-- All inputs are captured (directly or after partial processing) on the
-- rising edge of clk; result is a function of those registered values only,
-- so it reflects the inputs sampled at the previous rising edge.
entity bit_counter is
|
||||
port (
|
||||
-- clock; internal registers update on its rising edge
clk : in std_logic;
|
||||
-- 64-bit operand
rs : in std_ulogic_vector(63 downto 0);
|
||||
-- count-zeroes direction: '0' bit-reverses the operand before counting
-- (count from the most significant end), otherwise rs is used as-is
count_right : in std_ulogic;
|
||||
-- result select: '0' returns the zero count, otherwise the population count
do_popcnt : in std_ulogic;
|
||||
-- count-zeroes width: when not '0' only rs(31 downto 0) is examined
is_32bit : in std_ulogic;
|
||||
-- popcount granularity: bits 3:2 = "00" gives per-byte counts, bit 3 = '0'
-- (with bit 2 set) gives per-word counts, bit 3 set gives one 64-bit count
-- NOTE(review): presumably an operand length in bytes (1/4/8) - confirm with caller
datalen : in std_ulogic_vector(3 downto 0);
|
||||
-- selected count, zero-extended to 64 bits
result : out std_ulogic_vector(63 downto 0)
|
||||
);
|
||||
end entity bit_counter;
|
||||
|
||||
-- Two-stage implementation of bit_counter.
-- Stage 1 (combinational from rs): isolate the lowest set bit for the
-- zero count, and reduce rs to eight per-byte population counts.
-- Stage 2 (combinational from the registers): encode the isolated bit as a
-- number, finish the popcount sums, and select the requested result.
architecture behaviour of bit_counter is
|
||||
-- signals for count-leading/trailing-zeroes
|
||||
-- operand after optional bit reversal / 32-bit masking
signal inp : std_ulogic_vector(63 downto 0);
|
||||
-- 65-bit (not inp) + 1; bit 64 is the carry out
signal sum : std_ulogic_vector(64 downto 0);
|
||||
-- registered sum(64)
signal msb_r : std_ulogic;
|
||||
-- inp with only its lowest set bit kept
signal onehot : std_ulogic_vector(63 downto 0);
|
||||
-- registered onehot
signal onehot_r : std_ulogic_vector(63 downto 0);
|
||||
-- 6-bit value produced by bit_number(onehot_r)
signal bitnum : std_ulogic_vector(5 downto 0);
|
||||
-- zero-count result, zero-extended to 64 bits
signal cntz : std_ulogic_vector(63 downto 0);
|
||||
|
||||
-- signals for popcnt
|
||||
-- registered copy of datalen
signal dlen_r : std_ulogic_vector(3 downto 0);
|
||||
-- registered copy of do_popcnt
signal pcnt_r : std_ulogic;
|
||||
-- pc2: count of set bits in each of the 32 bit pairs of rs
subtype twobit is unsigned(1 downto 0);
|
||||
type twobit32 is array(0 to 31) of twobit;
|
||||
signal pc2 : twobit32;
|
||||
-- pc4: count of set bits in each of the 16 nibbles of rs
subtype threebit is unsigned(2 downto 0);
|
||||
type threebit16 is array(0 to 15) of threebit;
|
||||
signal pc4 : threebit16;
|
||||
-- pc8 / pc8_r: count of set bits in each of the 8 bytes of rs, and its
-- registered copy
subtype fourbit is unsigned(3 downto 0);
|
||||
type fourbit8 is array(0 to 7) of fourbit;
|
||||
signal pc8 : fourbit8;
|
||||
signal pc8_r : fourbit8;
|
||||
-- pc32: count of set bits in each 32-bit half, built from pc8_r
subtype sixbit is unsigned(5 downto 0);
|
||||
type sixbit2 is array(0 to 1) of sixbit;
|
||||
signal pc32 : sixbit2;
|
||||
-- population-count result in the layout selected by dlen_r
signal popcnt : std_ulogic_vector(63 downto 0);
|
||||
|
||||
begin
|
||||
-- Pipeline register for the count-zeroes path.
countzero_r: process(clk)
|
||||
begin
|
||||
if rising_edge(clk) then
|
||||
msb_r <= sum(64);
|
||||
onehot_r <= onehot;
|
||||
end if;
|
||||
end process;
|
||||
|
||||
-- Combinational logic for the count-zeroes path (both pipeline stages).
countzero: process(all)
|
||||
begin
|
||||
-- Counting always proceeds from bit 0 of inp upwards, so a count from
-- the other end is obtained by bit-reversing the operand first.
-- NOTE(review): bit_reverse comes from work.helpers and is not visible
-- here; assumed to reverse the bit order of its argument.
if is_32bit = '0' then
|
||||
if count_right = '0' then
|
||||
inp <= bit_reverse(rs);
|
||||
else
|
||||
inp <= rs;
|
||||
end if;
|
||||
else
|
||||
-- 32-bit mode: the upper half is forced to ones, so bit 32 is the
-- lowest set bit whenever the lower 32 bits are all zero
inp(63 downto 32) <= x"FFFFFFFF";
|
||||
if count_right = '0' then
|
||||
inp(31 downto 0) <= bit_reverse(rs(31 downto 0));
|
||||
else
|
||||
inp(31 downto 0) <= rs(31 downto 0);
|
||||
end if;
|
||||
end if;
|
||||
|
||||
-- sum(63 downto 0) is the two's complement negation of inp; sum(64) is
-- the carry out, which is set only when inp is all zeroes
sum <= std_ulogic_vector(unsigned('0' & not inp) + 1);
|
||||
-- inp and (-inp) leaves only the lowest set bit of inp
onehot <= sum(63 downto 0) and inp;
|
||||
|
||||
-- The following occurs after a clock edge
|
||||
-- NOTE(review): bit_number comes from work.helpers; presumably returns
-- the index of the single set bit (and 0 for an all-zero input) - confirm
bitnum <= bit_number(onehot_r);
|
||||
|
||||
-- result bit 6 is msb_r (operand was all zeroes), bits 5:0 are bitnum
cntz <= 57x"0" & msb_r & bitnum;
|
||||
end process;
|
||||
|
||||
-- Pipeline register for the popcount path: per-byte counts plus the
-- control inputs needed by the second stage.
popcnt_r: process(clk)
|
||||
begin
|
||||
if rising_edge(clk) then
|
||||
for i in 0 to 7 loop
|
||||
pc8_r(i) <= pc8(i);
|
||||
end loop;
|
||||
dlen_r <= datalen;
|
||||
pcnt_r <= do_popcnt;
|
||||
end if;
|
||||
end process;
|
||||
|
||||
-- Combinational logic for the popcount path (both pipeline stages).
popcnt_a: process(all)
|
||||
begin
|
||||
-- adder tree: pairs of bits, then pairs of pairs, then pairs of nibbles;
-- each level widens the operands by one bit so the sum cannot overflow
for i in 0 to 31 loop
|
||||
pc2(i) <= unsigned("0" & rs(i * 2 downto i * 2)) + unsigned("0" & rs(i * 2 + 1 downto i * 2 + 1));
|
||||
end loop;
|
||||
for i in 0 to 15 loop
|
||||
pc4(i) <= ('0' & pc2(i * 2)) + ('0' & pc2(i * 2 + 1));
|
||||
end loop;
|
||||
for i in 0 to 7 loop
|
||||
pc8(i) <= ('0' & pc4(i * 2)) + ('0' & pc4(i * 2 + 1));
|
||||
end loop;
|
||||
|
||||
-- after a clock edge
|
||||
-- sum the four registered byte counts of each 32-bit half
for i in 0 to 1 loop
|
||||
pc32(i) <= ("00" & pc8_r(i * 4)) + ("00" & pc8_r(i * 4 + 1)) +
|
||||
("00" & pc8_r(i * 4 + 2)) + ("00" & pc8_r(i * 4 + 3));
|
||||
end loop;
|
||||
|
||||
-- default to zero so the bits not assigned below stay cleared
popcnt <= (others => '0');
|
||||
if dlen_r(3 downto 2) = "00" then
|
||||
-- popcntb
|
||||
-- each byte of the result holds the count for the matching byte of rs
for i in 0 to 7 loop
|
||||
popcnt(i * 8 + 3 downto i * 8) <= std_ulogic_vector(pc8_r(i));
|
||||
end loop;
|
||||
elsif dlen_r(3) = '0' then
|
||||
-- popcntw
|
||||
-- each 32-bit half of the result holds the count for that half of rs
for i in 0 to 1 loop
|
||||
popcnt(i * 32 + 5 downto i * 32) <= std_ulogic_vector(pc32(i));
|
||||
end loop;
|
||||
else
|
||||
-- dlen_r(3) set: a single count over all 64 bits (0 to 64, 7 bits)
popcnt(6 downto 0) <= std_ulogic_vector(('0' & pc32(0)) + ('0' & pc32(1)));
|
||||
end if;
|
||||
end process;
|
||||
|
||||
-- select using the registered do_popcnt so it lines up with the registered data
result <= cntz when pcnt_r = '0' else popcnt;
|
||||
|
||||
end behaviour;
|
||||
@ -11,11 +11,11 @@ use work.common.all;
|
||||
library osvvm;
|
||||
use osvvm.RandomPkg.all;
|
||||
|
||||
entity countzero_tb is
|
||||
entity countbits_tb is
|
||||
generic (runner_cfg : string := runner_cfg_default);
|
||||
end countzero_tb;
|
||||
end countbits_tb;
|
||||
|
||||
architecture behave of countzero_tb is
|
||||
architecture behave of countbits_tb is
|
||||
constant clk_period: time := 10 ns;
|
||||
signal rs: std_ulogic_vector(63 downto 0);
|
||||
signal is_32bit, count_right: std_ulogic := '0';
|
||||
@ -23,13 +23,15 @@ architecture behave of countzero_tb is
|
||||
signal clk: std_ulogic;
|
||||
|
||||
begin
|
||||
zerocounter_0: entity work.zero_counter
|
||||
bitcounter_0: entity work.bit_counter
|
||||
port map (
|
||||
clk => clk,
|
||||
rs => rs,
|
||||
result => res,
|
||||
count_right => count_right,
|
||||
is_32bit => is_32bit
|
||||
is_32bit => is_32bit,
|
||||
do_popcnt => '0',
|
||||
datalen => "0000"
|
||||
);
|
||||
|
||||
clk_process: process
|
||||
@ -1,60 +0,0 @@
|
||||
library ieee;
|
||||
use ieee.std_logic_1164.all;
|
||||
use ieee.numeric_std.all;
|
||||
|
||||
library work;
|
||||
use work.helpers.all;
|
||||
|
||||
-- zero_counter: count-zeroes unit.
-- An intermediate value is registered on the rising edge of clk; result is
-- derived from that registered value only.
entity zero_counter is
|
||||
port (
|
||||
-- clock; internal registers update on its rising edge
clk : in std_logic;
|
||||
-- 64-bit operand
rs : in std_ulogic_vector(63 downto 0);
|
||||
-- direction: '0' bit-reverses the operand before counting (count from
-- the most significant end), otherwise rs is used as-is
count_right : in std_ulogic;
|
||||
-- when not '0' only rs(31 downto 0) is examined
is_32bit : in std_ulogic;
|
||||
-- zero count, zero-extended to 64 bits
result : out std_ulogic_vector(63 downto 0)
|
||||
);
|
||||
end entity zero_counter;
|
||||
|
||||
-- Two-stage implementation of zero_counter: the lowest set bit of the
-- (optionally reversed) operand is isolated before the clock edge and
-- encoded as a number after it.
architecture behaviour of zero_counter is
|
||||
-- operand after optional bit reversal / 32-bit masking
signal inp : std_ulogic_vector(63 downto 0);
|
||||
-- 65-bit (not inp) + 1; bit 64 is the carry out
signal sum : std_ulogic_vector(64 downto 0);
|
||||
-- registered sum(64)
signal msb_r : std_ulogic;
|
||||
-- inp with only its lowest set bit kept
signal onehot : std_ulogic_vector(63 downto 0);
|
||||
-- registered onehot
signal onehot_r : std_ulogic_vector(63 downto 0);
|
||||
-- 6-bit value produced by bit_number(onehot_r)
signal bitnum : std_ulogic_vector(5 downto 0);
|
||||
|
||||
begin
|
||||
-- Pipeline register between the two stages.
countzero_r: process(clk)
|
||||
begin
|
||||
if rising_edge(clk) then
|
||||
msb_r <= sum(64);
|
||||
onehot_r <= onehot;
|
||||
end if;
|
||||
end process;
|
||||
|
||||
-- Combinational logic for both stages.
countzero: process(all)
|
||||
begin
|
||||
-- Counting always proceeds from bit 0 of inp upwards, so a count from
-- the other end is obtained by bit-reversing the operand first.
-- NOTE(review): bit_reverse comes from work.helpers and is not visible
-- here; assumed to reverse the bit order of its argument.
if is_32bit = '0' then
|
||||
if count_right = '0' then
|
||||
inp <= bit_reverse(rs);
|
||||
else
|
||||
inp <= rs;
|
||||
end if;
|
||||
else
|
||||
-- 32-bit mode: the upper half is forced to ones, so bit 32 is the
-- lowest set bit whenever the lower 32 bits are all zero
inp(63 downto 32) <= x"FFFFFFFF";
|
||||
if count_right = '0' then
|
||||
inp(31 downto 0) <= bit_reverse(rs(31 downto 0));
|
||||
else
|
||||
inp(31 downto 0) <= rs(31 downto 0);
|
||||
end if;
|
||||
end if;
|
||||
|
||||
-- sum(63 downto 0) is the two's complement negation of inp; sum(64) is
-- the carry out, which is set only when inp is all zeroes
sum <= std_ulogic_vector(unsigned('0' & not inp) + 1);
|
||||
-- inp and (-inp) leaves only the lowest set bit of inp
onehot <= sum(63 downto 0) and inp;
|
||||
|
||||
-- The following occurs after a clock edge
|
||||
-- NOTE(review): bit_number comes from work.helpers; presumably returns
-- the index of the single set bit (and 0 for an all-zero input) - confirm
bitnum <= bit_number(onehot_r);
|
||||
|
||||
-- result bit 6 is msb_r (operand was all zeroes), bits 5:0 are bitnum
result <= x"00000000000000" & "0" & msb_r & bitnum;
|
||||
end process;
|
||||
end behaviour;
|
||||
@ -215,7 +215,6 @@ architecture behaviour of decode2 is
|
||||
OP_AND => "001", -- logical_result
|
||||
OP_OR => "001",
|
||||
OP_XOR => "001",
|
||||
OP_POPCNT => "001",
|
||||
OP_PRTY => "001",
|
||||
OP_CMPB => "001",
|
||||
OP_EXTS => "001",
|
||||
@ -234,7 +233,8 @@ architecture behaviour of decode2 is
|
||||
OP_DIV => "011",
|
||||
OP_DIVE => "011",
|
||||
OP_MOD => "011",
|
||||
OP_CNTZ => "100", -- countzero_result
|
||||
OP_CNTZ => "100", -- countbits_result
|
||||
OP_POPCNT => "100",
|
||||
OP_MFSPR => "101", -- spr_result
|
||||
OP_B => "110", -- next_nia
|
||||
OP_BC => "110",
|
||||
|
||||
@ -106,7 +106,8 @@ architecture behaviour of execute1 is
|
||||
signal rotator_result: std_ulogic_vector(63 downto 0);
|
||||
signal rotator_carry: std_ulogic;
|
||||
signal logical_result: std_ulogic_vector(63 downto 0);
|
||||
signal countzero_result: std_ulogic_vector(63 downto 0);
|
||||
signal do_popcnt: std_ulogic;
|
||||
signal countbits_result: std_ulogic_vector(63 downto 0);
|
||||
signal alu_result: std_ulogic_vector(63 downto 0);
|
||||
signal adder_result: std_ulogic_vector(63 downto 0);
|
||||
signal misc_result: std_ulogic_vector(63 downto 0);
|
||||
@ -284,13 +285,15 @@ begin
|
||||
datalen => e_in.data_len
|
||||
);
|
||||
|
||||
countzero_0: entity work.zero_counter
|
||||
countbits_0: entity work.bit_counter
|
||||
port map (
|
||||
clk => clk,
|
||||
rs => c_in,
|
||||
count_right => e_in.insn(10),
|
||||
is_32bit => e_in.is_32bit,
|
||||
result => countzero_result
|
||||
do_popcnt => do_popcnt,
|
||||
datalen => e_in.data_len,
|
||||
result => countbits_result
|
||||
);
|
||||
|
||||
multiply_0: entity work.multiply
|
||||
@ -391,7 +394,7 @@ begin
|
||||
logical_result when "001",
|
||||
rotator_result when "010",
|
||||
muldiv_result when "011",
|
||||
countzero_result when "100",
|
||||
countbits_result when "100",
|
||||
spr_result when "101",
|
||||
next_nia when "110",
|
||||
misc_result when others;
|
||||
@ -813,6 +816,8 @@ begin
|
||||
rot_clear_right <= '1' when e_in.insn_type = OP_RLC or e_in.insn_type = OP_RLCR else '0';
|
||||
rot_sign_ext <= '1' when e_in.insn_type = OP_EXTSWSLI else '0';
|
||||
|
||||
do_popcnt <= '1' when e_in.insn_type = OP_POPCNT else '0';
|
||||
|
||||
illegal := '0';
|
||||
if r.intr_pending = '1' then
|
||||
v.e.srr1 := r.e.srr1;
|
||||
@ -963,7 +968,7 @@ begin
|
||||
when OP_ADDG6S =>
|
||||
when OP_CMPRB =>
|
||||
when OP_CMPEQB =>
|
||||
when OP_AND | OP_OR | OP_XOR | OP_POPCNT | OP_PRTY | OP_CMPB | OP_EXTS |
|
||||
when OP_AND | OP_OR | OP_XOR | OP_PRTY | OP_CMPB | OP_EXTS |
|
||||
OP_BPERM | OP_BCD =>
|
||||
|
||||
when OP_B =>
|
||||
@ -1025,7 +1030,7 @@ begin
|
||||
end if;
|
||||
do_trace := '0';
|
||||
|
||||
when OP_CNTZ =>
|
||||
when OP_CNTZ | OP_POPCNT =>
|
||||
v.e.valid := '0';
|
||||
v.cntz_in_progress := '1';
|
||||
v.busy := '1';
|
||||
@ -1220,7 +1225,7 @@ begin
|
||||
-- valid_in = 0. Hence they don't happen in the same cycle as any of
|
||||
-- the cases above which depend on valid_in = 1.
|
||||
if r.cntz_in_progress = '1' then
|
||||
-- cnt[lt]z always takes two cycles
|
||||
-- cnt[lt]z and popcnt* always take two cycles
|
||||
v.e.valid := '1';
|
||||
elsif r.mul_in_progress = '1' or r.div_in_progress = '1' then
|
||||
if (r.mul_in_progress = '1' and multiply_to_x.valid = '1') or
|
||||
|
||||
44
logical.vhdl
44
logical.vhdl
@ -20,20 +20,7 @@ end entity logical;
|
||||
|
||||
architecture behaviour of logical is
|
||||
|
||||
subtype twobit is unsigned(1 downto 0);
|
||||
type twobit32 is array(0 to 31) of twobit;
|
||||
signal pc2 : twobit32;
|
||||
subtype threebit is unsigned(2 downto 0);
|
||||
type threebit16 is array(0 to 15) of threebit;
|
||||
signal pc4 : threebit16;
|
||||
subtype fourbit is unsigned(3 downto 0);
|
||||
type fourbit8 is array(0 to 7) of fourbit;
|
||||
signal pc8 : fourbit8;
|
||||
subtype sixbit is unsigned(5 downto 0);
|
||||
type sixbit2 is array(0 to 1) of sixbit;
|
||||
signal pc32 : sixbit2;
|
||||
signal par0, par1 : std_ulogic;
|
||||
signal popcnt : std_ulogic_vector(63 downto 0);
|
||||
signal parity : std_ulogic_vector(63 downto 0);
|
||||
signal permute : std_ulogic_vector(7 downto 0);
|
||||
|
||||
@ -109,35 +96,6 @@ begin
|
||||
variable negative : std_ulogic;
|
||||
variable j : integer;
|
||||
begin
|
||||
-- population counts
|
||||
for i in 0 to 31 loop
|
||||
pc2(i) <= unsigned("0" & rs(i * 2 downto i * 2)) + unsigned("0" & rs(i * 2 + 1 downto i * 2 + 1));
|
||||
end loop;
|
||||
for i in 0 to 15 loop
|
||||
pc4(i) <= ('0' & pc2(i * 2)) + ('0' & pc2(i * 2 + 1));
|
||||
end loop;
|
||||
for i in 0 to 7 loop
|
||||
pc8(i) <= ('0' & pc4(i * 2)) + ('0' & pc4(i * 2 + 1));
|
||||
end loop;
|
||||
for i in 0 to 1 loop
|
||||
pc32(i) <= ("00" & pc8(i * 4)) + ("00" & pc8(i * 4 + 1)) +
|
||||
("00" & pc8(i * 4 + 2)) + ("00" & pc8(i * 4 + 3));
|
||||
end loop;
|
||||
popcnt <= (others => '0');
|
||||
if datalen(3 downto 2) = "00" then
|
||||
-- popcntb
|
||||
for i in 0 to 7 loop
|
||||
popcnt(i * 8 + 3 downto i * 8) <= std_ulogic_vector(pc8(i));
|
||||
end loop;
|
||||
elsif datalen(3) = '0' then
|
||||
-- popcntw
|
||||
for i in 0 to 1 loop
|
||||
popcnt(i * 32 + 5 downto i * 32) <= std_ulogic_vector(pc32(i));
|
||||
end loop;
|
||||
else
|
||||
popcnt(6 downto 0) <= std_ulogic_vector(('0' & pc32(0)) + ('0' & pc32(1)));
|
||||
end if;
|
||||
|
||||
-- parity calculations
|
||||
par0 <= rs(0) xor rs(8) xor rs(16) xor rs(24);
|
||||
par1 <= rs(32) xor rs(40) xor rs(48) xor rs(56);
|
||||
@ -178,8 +136,6 @@ begin
|
||||
tmp := not tmp;
|
||||
end if;
|
||||
|
||||
when OP_POPCNT =>
|
||||
tmp := popcnt;
|
||||
when OP_PRTY =>
|
||||
tmp := parity;
|
||||
when OP_CMPB =>
|
||||
|
||||
@ -18,7 +18,7 @@ filesets:
|
||||
- ppc_fx_insns.vhdl
|
||||
- sim_console.vhdl
|
||||
- logical.vhdl
|
||||
- countzero.vhdl
|
||||
- countbits.vhdl
|
||||
- control.vhdl
|
||||
- execute1.vhdl
|
||||
- fpu.vhdl
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user