mirror of
https://github.com/antonblanchard/microwatt.git
synced 2026-04-03 12:03:15 +00:00
countzero: Use alternative algorithm for higher bits
This implements an alternative count-leading-zeroes algorithm which uses fewer LUTs to generate the higher-order bits (2..5) of the result. By doing (v | -v) rather than (v & -v), we get a value which has ones from the MSB down to the rightmost 1 bit in v and then zeroes down to the LSB. This means that we can generate the MSB of the result (the index of the rightmost 1 bit in v) just by looking at bits 63 and 31 of (v | -v), assuming that v is 64 bits. Bit 4 of the result requires looking at bits 63, 47, 31 and 15. In contrast, each bit of the result using (v & -v), which has a single 1 bit, requires ORing together 32 bits. It turns out that the minimum LUT usage comes from using (v & -v) to generate bits 0 and 1 of the result, and using (v | -v) to generate bits 2 to 5. This saves almost 60 6-input LUTs on the Artix-7. Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
This commit is contained in:
@@ -20,10 +20,11 @@ end entity bit_counter;
|
||||
architecture behaviour of bit_counter is
|
||||
-- signals for count-leading/trailing-zeroes
|
||||
signal inp : std_ulogic_vector(63 downto 0);
|
||||
signal inp_r : std_ulogic_vector(63 downto 0);
|
||||
signal sum : std_ulogic_vector(64 downto 0);
|
||||
signal msb_r : std_ulogic;
|
||||
signal sum_r : std_ulogic_vector(64 downto 0);
|
||||
signal onehot : std_ulogic_vector(63 downto 0);
|
||||
signal onehot_r : std_ulogic_vector(63 downto 0);
|
||||
signal edge : std_ulogic_vector(63 downto 0);
|
||||
signal bitnum : std_ulogic_vector(5 downto 0);
|
||||
signal cntz : std_ulogic_vector(63 downto 0);
|
||||
|
||||
@@ -45,16 +46,36 @@ architecture behaviour of bit_counter is
|
||||
signal pc32 : sixbit2;
|
||||
signal popcnt : std_ulogic_vector(63 downto 0);
|
||||
|
||||
-- Returns an nbits-wide vector built from selected bit pairs of v.
--
-- For result bit i, v is viewed as aligned groups of 2**(i+1) bits.  Each
-- group contributes the term
--     v(top bit of group) and not v(top bit of the group's lower half)
-- and result bit i is the OR of those terms over all groups.
--
-- When v is all ones from its top bit down to some bit n and all zeroes
-- below n, exactly the groups straddling that edge fire and the result
-- is the binary encoding of n.
--
-- v is indexed directly, so its index range must cover 0 to 2**nbits - 1.
function edgelocation(v: std_ulogic_vector; nbits: natural) return std_ulogic_vector is
    variable result   : std_ulogic_vector(nbits - 1 downto 0);
    variable span     : natural;     -- group width for the current result bit
    variable base     : natural;     -- lowest index of the current group
    variable top_idx  : natural;     -- top bit of the group
    variable mid_idx  : natural;     -- top bit of the group's lower half
    variable hit      : std_ulogic;
begin
    span := 2;
    for bit_idx in 0 to nbits - 1 loop
        hit := '0';
        for grp in 0 to (2**nbits / span) - 1 loop
            base    := grp * span;
            top_idx := base + span - 1;
            mid_idx := base + (span / 2) - 1;
            -- The edge lies in the upper half of this group when the top
            -- bit is set but the top of the lower half is clear.
            hit := hit or (v(top_idx) and not v(mid_idx));
        end loop;
        result(bit_idx) := hit;
        -- The next result bit uses groups twice as wide.
        span := span * 2;
    end loop;
    return result;
end function;
|
||||
|
||||
begin
|
||||
-- Pipeline register stage for the count-zeroes logic: on each rising
-- edge of clk, latch the combinational values so the second half of the
-- computation can use the *_r copies in the following cycle.
countzero_r: process(clk)
begin
    if rising_edge(clk) then
        -- Raw operand and the 65-bit sum computed from it.
        inp_r    <= inp;
        sum_r    <= sum;
        -- One-hot value and the top bit of the sum.
        onehot_r <= onehot;
        msb_r    <= sum(64);
    end if;
end process countzero_r;
|
||||
|
||||
countzero: process(all)
|
||||
variable bitnum_e, bitnum_o : std_ulogic_vector(5 downto 0);
|
||||
begin
|
||||
if is_32bit = '0' then
|
||||
if count_right = '0' then
|
||||
@@ -72,12 +93,16 @@ begin
|
||||
end if;
|
||||
|
||||
sum <= std_ulogic_vector(unsigned('0' & not inp) + 1);
|
||||
onehot <= sum(63 downto 0) and inp;
|
||||
|
||||
-- The following occurs after a clock edge
|
||||
bitnum <= bit_number(onehot_r);
|
||||
edge <= sum_r(63 downto 0) or inp_r;
|
||||
bitnum_e := edgelocation(edge, 6);
|
||||
onehot <= sum_r(63 downto 0) and inp_r;
|
||||
bitnum_o := bit_number(onehot);
|
||||
bitnum(5 downto 2) <= bitnum_e(5 downto 2);
|
||||
bitnum(1 downto 0) <= bitnum_o(1 downto 0);
|
||||
|
||||
cntz <= 57x"0" & msb_r & bitnum;
|
||||
cntz <= 57x"0" & sum_r(64) & bitnum;
|
||||
end process;
|
||||
|
||||
popcnt_r: process(clk)
|
||||
|
||||
Reference in New Issue
Block a user