# Square root

This FPGA Project was completed in July 2015

I needed to calculate square roots, and the standard software method (the e Newtonâ€“Raphson method - https://en.wikipedia.org/wiki/Newton's_method ) isn't well suited to implementation in and FPGA as it uses division.

The method below doesn't require division, and can generate an additional bit of result every clock cycle.

It can also be extended to generate fixed-point binary decimal results too.

The input and output are both unconstrained vectors, and 'result' must be long enough to hold the sqrt when "number" is all ones.

Sorry that it is a little bit ugly due to allowing 'number' to have an odd number of bits.

## A C implementation of the algorithm

```/*****************************************
* my_sqrt.c: Implmenting an integer square
*            root using only bit-shifts
*
* Author: Mike Field <hamster@snap.net.nz>
*
*****************************************/
#include <stdio.h>
#include <math.h>

unsigned my_sqrt(unsigned a) {
unsigned s = 0; /* Where the result is assembled */
int      q = 15;  /* Bit we are working on */

while(q >= 0) {
/* 'change' represents how much setting bit
* 'q' in 's' will change the value of s^2
*
* It is (2^q)*(2^q) + s*(2^q) + s*(2^q),
* but can be calculated using addition and
* bit shift operations.
*/
unsigned change = (1<<(q+q))+ (s<<(q+1));
if(a >= change) {
a -= change;
s |= 1<<q;
}
q--;
}
return s;
}

/***************************************
* Verify my square root implementation
* for all 32-bit unsigned integers
**************************************/
int main(int argc, char *argv[])
{
unsigned a=0;

do {
unsigned s = my_sqrt(a);
if(s*s>a || s*s+2*s+1 < a) {
printf("ERROR %u: %u\r\n",a, my_sqrt(a));
}
a++;
} while(a != 0);
return 0;
}

```

## sqrt.vhd

```-------------------------------------------
-- File:   sqrt.vhd
--
-- Author: Mike Field <hamster@snap.net.nz>
--
-- Calculate the square root of an integer
-- in length/2+1 cycles. (e.g a 31 or 32 bit
-- integer takes 16 cycles to find the result)
--
-- It relys on the fact that setting bit 'n' in
-- a partial result 'p' will increase the value
-- of p*p by 2^n*2^n+p*2^n+p*2^n. This can all
-- be calculated using bit shifts (which are
-- free in FPGAs
-------------------------------------------
library IEEE;
use IEEE.STD_LOGIC_1164.ALL;
use IEEE.NUMERIC_STD.ALL;

entity sqrt is
Port ( clk : in  STD_LOGIC;
number : in  STD_LOGIC_VECTOR;
result : out  STD_LOGIC_VECTOR);
end sqrt;

architecture Behavioral of sqrt is
type a_partial   is array(result'high downto 0) of unsigned(result'range);
type a_remaining is array(result'high downto 0) of unsigned(number'range);
signal partial   : a_partial   := (others => (others => '0'));
signal remaining : a_remaining := (others => (others => '0'));
signal zeros : unsigned (number'high downto 0) := (others => '0');
begin
-----------------------------------------------
-- Calculate the square root, one bit per cycle.
-----------------------------------------------
calc_proc: process(clk)
variable change : unsigned(number'range);
begin
if rising_edge(clk) then

-------------------------------------------------------------------------------------
-- Extracting the topmost bit of the result is different, as we have to allow
-- for an odd number of bits in "number".
--
-- If 'number' has an odd number of bits, then is the MSB is set
-- If 'number' has an even number of bits, are any of the top bits are set
--
-- If so, set the highest bit in result.
-------------------------------------------------------------------------------------
if unsigned(number(number'high downto (number'high/2)*2)) > 0 then
-- Calculate the first partial result, by copying everything over
remaining(number'high/2) <= unsigned(number);
-- then subtracting 1 from the highest bit/bits
remaining(number'high/2)(number'high downto (number'high/2)*2) <= unsigned(number(number'high downto (number'high/2)*2))-1;
partial(partial'high)     <= (number'high/2 => '1', others => '0');
else
-- Top bit of partial result is not set
remaining(remaining'high) <= unsigned(number);
partial(partial'high)     <= (others => '0');
end if;

------------------------------------------------
-- Now calculate all the other bits the hard way
------------------------------------------------
for i in number'high/2-1 downto 0 loop
change := zeros + ("1" & zeros(i-1 downto 0) & zeros(i-1 downto 0)) + (partial(i+1) & zeros(i downto 0));
if remaining(i+1) >= change then
remaining(i)  <= remaining(i+1) - change;
partial(i)    <= partial(i+1);
partial(i)(i) <= '1';
else
remaining(i) <= remaining(i+1);
partial(i)   <= partial(i+1);
end if;
end loop;
end if;
end process;

-----------------------------------------------
-- assign the output to the final result
-----------------------------------------------
result <= std_logic_vector(partial(0));

end Behavioral;

```

## Test bench

This is a test bench that will check sqrt is working. If there is an error 'fault' will be asserted

Fault will be asserted for the first 15 cycles or so as the pipeline fills.

```LIBRARY ieee;
USE ieee.std_logic_1164.ALL;
USE ieee.numeric_std.ALL;

ENTITY tb_sqrt IS
END tb_sqrt;

ARCHITECTURE behavior OF tb_sqrt IS
COMPONENT sqrt
PORT(
clk : IN  std_logic;
number : IN  std_logic_vector;
result : OUT  std_logic_vector
);
END COMPONENT;

signal number : std_logic_vector(31 downto 0) := (others => '1');
signal result : std_logic_vector(15 downto 0) := (others => '0');
signal clk : std_logic := '0';
signal fault : std_logic := '0';
constant clk_period : time := 10 ns;
BEGIN
uut: sqrt PORT MAP (
clk => clk,
number => number,
result => result
);

-- Clock process definitions
clk_process :process
begin
clk <= '0';
wait for clk_period/2;
clk <= '1';
wait for clk_period/2;

end process;

process(clk)
begin
if rising_edge(clK) then
number <= std_logic_vector(unsigned(number)+1);
if unsigned(result) * unsigned(result) > unsigned(number)-16 or
(unsigned(result)+1) * (unsigned(result)+1) <= unsigned(number)-16 then
fault <= '1';
else
fault <= '0';
end if;
end if;
end process;
end;

```

## Comparison with standard IP

As written,this uses 392 registers, 590 LUTs for a total of 169 slices. The supplied Cordic IP uses 341 registers, 427 LUTs fora total of 137 slices, about 25% percent more. Can it be optimized smaller?

Sure! By looking at the maximum value of 'remainder' we can see that some bits will always be zero:

Stage Maximum value for remainder
15 BFFFFFFF
14 6FFFFFFF
13 3BFFFFFF
12 1EFFFFFF
11 FBFFFFF
10 7EFFFFF
9 3FBFFFF
8 1FEFFFF
7 FFBFFF
6 7FEFFF
5 3FFBFF
4 1FFEFF
3 FFFBF
2 7FFEF
1 3FFFB
0 1FFFE

### Improved code

This weighs in at 301 registers, 379 LUTs and 111 slices - about 80% of the size of the Cordic IP.

```-------------------------------------------
-- File:   sqrt.vhd
--
-- Author: Mike Field <hamster@snap.net.nz>
--
-- Calculate the square root of an integer
-- in length/2+1 cycles. (e.g a 31 or 32 bit
-- integer takes 16 cycles to find the result)
--
-- It relys on the fact that setting bit 'n' in
-- a partial result 'p' will increase the value
-- of p*p by 2^n*2^n+p*2^n+p*2^n. This can all
-- be calculated using bit shifts (which are
-- free in FPGAs
-------------------------------------------
library IEEE;
use IEEE.STD_LOGIC_1164.ALL;
use IEEE.NUMERIC_STD.ALL;

entity sqrt is
Port ( clk : in  STD_LOGIC;
number : in  STD_LOGIC_VECTOR;
result : out  STD_LOGIC_VECTOR);
end sqrt;

architecture Behavioral of sqrt is
type a_partial   is array(result'high downto 0) of unsigned(result'range);
type a_remaining is array(result'high downto 0) of unsigned(number'range);
signal partial   : a_partial   := (others => (others => '0'));
signal remaining : a_remaining := (others => (others => '0'));
signal zeros : unsigned (number'high downto 0) := (others => '0');
begin
-----------------------------------------------
-- Calculate the square root, one bit per cycle.
-----------------------------------------------
calc_proc: process(clk)
variable change : unsigned(number'range);
begin
if rising_edge(clk) then

-------------------------------------------------------------------------------------
-- Extracting the topmost bit of the result is different, as we have to allow
-- for an odd number of bits in "number".
--
-- If 'number' has an odd number of bits, then is the MSB is set
-- If 'number' has an even number of bits, are any of the top bits are set
--
-- If so, set the highest bit in result.
-------------------------------------------------------------------------------------
if unsigned(number(number'high downto (number'high/2)*2)) > 0 then
-- Calculate the first partial result, by copying everything over
remaining(number'high/2) <= unsigned(number);
-- then subtracting 1 from the highest bit/bits
remaining(number'high/2)(number'high downto (number'high/2)*2) <= unsigned(number(number'high downto (number'high/2)*2))-1;
partial(partial'high)     <= (number'high/2 => '1', others => '0');
else
-- Top bit of partial result is not set
remaining(remaining'high) <= unsigned(number);
partial(partial'high)     <= (others => '0');
end if;

------------------------------------------------
-- Now calculate all the other bits the hard way
------------------------------------------------
for i in number'high/2-1 downto 0 loop
change := zeros + ("1" & zeros(i-1 downto 0) & zeros(i-1 downto 0)) + (partial(i+1) & zeros(i downto 0));
if remaining(i+1) >= change then
remaining(i)  <= remaining(i+1) - change;
partial(i)    <= partial(i+1);
partial(i)(i) <= '1';
else
remaining(i) <= remaining(i+1);
partial(i)   <= partial(i+1);
end if;
end loop;

----------------------------------------
-- Finally zero out the bits that
-- we know will be zero, to give the
-- the logic optimizer hints.
----------------------------------
for i in number'high/2-1 downto 0 loop
remaining(i)(number'high downto i+(number'high)/2+2) <= (others => '0');
end loop;
end if;

end process;

-----------------------------------------------
-- assign the output to the final result
-----------------------------------------------
result <= std_logic_vector(partial(0));

end Behavioral;

```