///////////////////////////////////////////////////////
// ling.v                                            //
//                                                   //
// Written 10/16/96 by David Harris harrisd@leland   //
// EE371 Problem Set 4                               //
//                                                   //
// Corrected 10/23/96:  fixed C16 logic              //
// Updated 11/8/96: changed for symmetric _h, _l     //
//                                                   //
// This file models a 64 bit Ling adder based on     //
// the 1996 ISSCC paper (SP22.5) by S. Naffziger.    //
//                                                   //
///////////////////////////////////////////////////////

// This Verilog file contains a model of the 64 bit Ling adder
// as best as could be constructed from the ISSCC paper.  It
// also contains instrumentation to generate random test vectors
// and check the results.  If the adder passes a million
// random tests, it probably is logically correct (or at least has
// a pretty subtle bug!)
//
// The paper is pretty good, but, like most papers, has some
// errors and some important omissions.  As far as I can tell:
//
//  The "Short Carry Generate" circuit in the slide supplement
//  disagrees with Figure 2 from the Digest of Technical Papers.
//  I believe the slides are correct (there is no G0 term).
//
//  C16 is written as (H0+I0)*I1*I2*I3 + H1*I2*I3 + H2*I3 + H3
//  in the Digest, but as H3 + H2*I3 + H1*I2*I3 + H0*I1*I2*I3
//  in the slide supplement.  Each form is correct for some of the
//  16 bit blocks and wrong for the others:  the I0 term is required
//  for the low-order 16 bit block to incorporate the adder's cin
//  term, and it is left out of the other 16 bit blocks.
//
// Another key point mentioned in correspondence with the author
// but only cryptically in the paper is that I4 is not P0*P1*P2*P3 as one might
// expect, but rather P-1*P0*P1*P2, where P-1 is the propagate
// from the previous block of 4 (or the carry in to the very first
// block).  This is nonobvious, but causes the carry chain to work
// out nicely.
//
// Finally, there is a P term missing from the equations for true
// carries in.  For example, Long_C (the carry in to a 16 bit block)
// is defined for the high 16 bits to be C48 + C32*P48 + C16*P32*P48.
// The real carry in to the high 16 bits is Long_C*P47, where P47 is
// A47 + B47.  This propagate is factored into the short carry chain
// instead.
// 
// This model doesn't strictly match the ISSCC paper for the "short carry
// ripple" logic.  The model propagates 4-bit pseudo-carries through three
// domino gates, then trips a mux which selects the 1-bit real carries
// which have been computed assuming pseudo-carry of 0 or 1 into the
// 4 bit block.  Sam somehow propagates real 4 bit carries using his C3
// term; I am not entirely clear on the logic.
//
// All of the circuits are implemented with domino.  Instead of using
// "true" dual rail domino where _h and _l signals are complementary,
// _h and _l are redefined to mean "applies to a 1" and "applies to a 0".
// For example I_h means the propagate for a 1, while I_l means the
// propagate for a 0.  H_h means that a 1 is (pseudo)generated, while H_l
// means that a 0 is pseudogenerated, not that a 1 is not generated.
// The adder logic has interesting self-dual properties which make everything
// work out so that the carries are produced properly.  Moreover, the
// same kind of gate can now be used for _h and _l which makes layout and
// circuit design simpler.
//

///////////////////////////////
// Top Level Instrumentation //
///////////////////////////////
module top();

  // Testbench harness: wires the random stimulus generator, the adder
  // under test, the clock generator and the result checker together.

  // Operand, carry-in, result and clock nets
  wire [63:0] a;
  wire [63:0] b;
  wire        cin;
  wire [63:0] r_h, r_l;
  wire        clk;

  // Device under test.  The adder takes every input as a pair of rails;
  // the testbench supplies the value on the _h rail and its bitwise
  // complement on the _l rail.
  adder64  adder(.a_h(a),     .a_l(~a),
                 .b_h(b),     .b_l(~b),
                 .cin_h(cin), .cin_l(~cin),
                 .r_h(r_h),   .r_l(r_l));

  // Free-running clock
  clock    clockgen(.clk(clk));

  // Random operand source and result checker.  Only the r_h rail of the
  // result is checked; r_l is left unobserved.
  stimulus stimgen(.a(a), .b(b), .cin(cin), .clk(clk));
  checker  stimchk(.a(a), .b(b), .cin(cin), .r(r_h), .clk(clk));

endmodule
 
/////////////
// adder64 //
/////////////
module adder64(a_h, a_l, b_h, b_l, cin_h, cin_l, r_h, r_l);

  // 64 bit Ling adder based on Naffziger, ISSCC 96.
  //
  // The adder is assembled from four 16 bit blocks.  Each block produces
  // a 16 bit carry (c) and propagate (p); the "long carry" logic at the
  // bottom of this module combines them into the carry fed to each of
  // the three upper blocks.  Every signal exists as an _h and an _l rail.

  // Ports (the fact that the real adder is clocked is neglected)
  input  [63:0] a_h, a_l;       // operand A rails
  input  [63:0] b_h, b_l;       // operand B rails
  input         cin_h, cin_l;   // carry in rails
  output [63:0] r_h, r_l;       // result rails

  // Per-block carry and propagate.  c64/p64 are produced by the top
  // block but are not used anywhere in this module.
  wire          c16_h, c32_h, c48_h, c64_h;
  wire          c16_l, c32_l, c48_l, c64_l;
  wire          p16_h, p32_h, p48_h, p64_h;
  wire          p16_l, p32_l, p48_l, p64_l;

  // Carries into the three upper 16 bit blocks
  wire          long_c16_h, long_c32_h, long_c48_h;
  wire          long_c16_l, long_c32_l, long_c48_l;

  // 16 bit blocks.  Each block receives a 17 bit operand slice: its own
  // 16 bits plus the bit immediately below them.  For the low-order
  // block that extra bit is cin (on both operands), and cin also serves
  // as the block's long carry; its c16 logic differs from the other
  // blocks so that cin is factored in.
  adder16low adder16_0(
      .a_h({a_h[15:0], cin_h}), .a_l({a_l[15:0], cin_l}),
      .b_h({b_h[15:0], cin_h}), .b_l({b_l[15:0], cin_l}),
      .r_h(r_h[15:0]),          .r_l(r_l[15:0]),
      .c16_h(c16_h),            .c16_l(c16_l),
      .p16_h(p16_h),            .p16_l(p16_l),
      .longc_h(cin_h),          .longc_l(cin_l));

  adder16 adder16_1(
      .a_h(a_h[31:15]),         .a_l(a_l[31:15]),
      .b_h(b_h[31:15]),         .b_l(b_l[31:15]),
      .r_h(r_h[31:16]),         .r_l(r_l[31:16]),
      .c16_h(c32_h),            .c16_l(c32_l),
      .p16_h(p32_h),            .p16_l(p32_l),
      .longc_h(long_c16_h),     .longc_l(long_c16_l));

  adder16 adder16_2(
      .a_h(a_h[47:31]),         .a_l(a_l[47:31]),
      .b_h(b_h[47:31]),         .b_l(b_l[47:31]),
      .r_h(r_h[47:32]),         .r_l(r_l[47:32]),
      .c16_h(c48_h),            .c16_l(c48_l),
      .p16_h(p48_h),            .p16_l(p48_l),
      .longc_h(long_c32_h),     .longc_l(long_c32_l));

  adder16 adder16_3(
      .a_h(a_h[63:47]),         .a_l(a_l[63:47]),
      .b_h(b_h[63:47]),         .b_l(b_l[63:47]),
      .r_h(r_h[63:48]),         .r_l(r_l[63:48]),
      .c16_h(c64_h),            .c16_l(c64_l),
      .p16_h(p64_h),            .p16_l(p64_l),
      .longc_h(long_c48_h),     .longc_l(long_c48_l));

  // Long carries into the upper 16 bit blocks, written as a flat
  // sum of products.  Each is a single gate delay from the block
  // c/p outputs; the _l rail uses the same equation as the _h rail.

  assign #1 long_c16_h = c16_h;
  assign #1 long_c32_h = c32_h || (p32_h && c16_h);
  assign #1 long_c48_h = c48_h || (p48_h && c32_h) ||
                         (p48_h && p32_h && c16_h);

  assign #1 long_c16_l = c16_l;
  assign #1 long_c32_l = c32_l || (p32_l && c16_l);
  assign #1 long_c48_l = c48_l || (p48_l && c32_l) ||
                         (p48_l && p32_l && c16_l);

endmodule

module adder16(a_h, a_l, b_h, b_l, r_h, r_l,
               c16_h, c16_l, p16_h, p16_l, longc_h, longc_l);

  // 16 bit adder block built from four 4 bit sub-blocks.
  //
  // Two short pseudo-carry chains ripple through the sub-blocks in
  // parallel: the cin0 chain starts from a pseudo-carry of 0 and the
  // cin1 chain starts from a pseudo-carry of 1.  The long carry into
  // the block (longc) is passed down to the 1 bit cells, which use it
  // to pick between the two speculative results.

  // Ports
  input  [16:0] a_h, a_l;         // operand A, really bits [15:-1]
  input  [16:0] b_h, b_l;         // operand B, really bits [15:-1]
  output [15:0] r_h, r_l;         // result
  output        c16_h, c16_l;     // carry produced by this 16 bit block
  output        p16_h, p16_l;     // propagate across this 16 bit block
  input         longc_h, longc_l; // carry in to this 16 bit block

  // Internal nets
  wire   [3:0]  h_h, h_l;         // H of each 4 bit sub-block
  wire   [3:0]  i_h, i_l;         // I of each 4 bit sub-block
  wire   [4:0]  cin0_h, cin0_l;   // pseudo-carry chain seeded with 0
  wire   [4:0]  cin1_h, cin1_l;   // pseudo-carry chain seeded with 1

  // 4 bit sub-blocks.  Sub-block n takes chain element [n] as its carry
  // in and drives element [n+1] as its carry out.  Element [4] of each
  // chain is driven by the last sub-block and is not read in this module.

  adder4 adder4_0(
      .a_h(a_h[4:0]),       .a_l(a_l[4:0]),
      .b_h(b_h[4:0]),       .b_l(b_l[4:0]),
      .r_h(r_h[3:0]),       .r_l(r_l[3:0]),
      .h4_h(h_h[0]),        .h4_l(h_l[0]),
      .i4_h(i_h[0]),        .i4_l(i_l[0]),
      .cin0_h(cin0_h[0]),   .cin0_l(cin0_l[0]),
      .cin1_h(cin1_h[0]),   .cin1_l(cin1_l[0]),
      .cout0_h(cin0_h[1]),  .cout0_l(cin0_l[1]),
      .cout1_h(cin1_h[1]),  .cout1_l(cin1_l[1]),
      .longc_h(longc_h),    .longc_l(longc_l));

  adder4 adder4_1(
      .a_h(a_h[8:4]),       .a_l(a_l[8:4]),
      .b_h(b_h[8:4]),       .b_l(b_l[8:4]),
      .r_h(r_h[7:4]),       .r_l(r_l[7:4]),
      .h4_h(h_h[1]),        .h4_l(h_l[1]),
      .i4_h(i_h[1]),        .i4_l(i_l[1]),
      .cin0_h(cin0_h[1]),   .cin0_l(cin0_l[1]),
      .cin1_h(cin1_h[1]),   .cin1_l(cin1_l[1]),
      .cout0_h(cin0_h[2]),  .cout0_l(cin0_l[2]),
      .cout1_h(cin1_h[2]),  .cout1_l(cin1_l[2]),
      .longc_h(longc_h),    .longc_l(longc_l));

  adder4 adder4_2(
      .a_h(a_h[12:8]),      .a_l(a_l[12:8]),
      .b_h(b_h[12:8]),      .b_l(b_l[12:8]),
      .r_h(r_h[11:8]),      .r_l(r_l[11:8]),
      .h4_h(h_h[2]),        .h4_l(h_l[2]),
      .i4_h(i_h[2]),        .i4_l(i_l[2]),
      .cin0_h(cin0_h[2]),   .cin0_l(cin0_l[2]),
      .cin1_h(cin1_h[2]),   .cin1_l(cin1_l[2]),
      .cout0_h(cin0_h[3]),  .cout0_l(cin0_l[3]),
      .cout1_h(cin1_h[3]),  .cout1_l(cin1_l[3]),
      .longc_h(longc_h),    .longc_l(longc_l));

  adder4 adder4_3(
      .a_h(a_h[16:12]),     .a_l(a_l[16:12]),
      .b_h(b_h[16:12]),     .b_l(b_l[16:12]),
      .r_h(r_h[15:12]),     .r_l(r_l[15:12]),
      .h4_h(h_h[3]),        .h4_l(h_l[3]),
      .i4_h(i_h[3]),        .i4_l(i_l[3]),
      .cin0_h(cin0_h[3]),   .cin0_l(cin0_l[3]),
      .cin1_h(cin1_h[3]),   .cin1_l(cin1_l[3]),
      .cout0_h(cin0_h[4]),  .cout0_l(cin0_l[4]),
      .cout1_h(cin1_h[4]),  .cout1_l(cin1_l[4]),
      .longc_h(longc_h),    .longc_l(longc_l));

  // 16 bit carry:  H3 + H2*I3 + H1*I2*I3 + H0*I1*I2*I3
  // (no I0 term in this block; see adder16low for the variant with it)

  assign #1 c16_h = h_h[3] ||
                    (i_h[3] && h_h[2]) ||
                    (i_h[3] && i_h[2] && h_h[1]) ||
                    (i_h[3] && i_h[2] && i_h[1] && h_h[0]);
  assign #1 c16_l = h_l[3] ||
                    (i_l[3] && h_l[2]) ||
                    (i_l[3] && i_l[2] && h_l[1]) ||
                    (i_l[3] && i_l[2] && i_l[1] && h_l[0]);

  // 16 bit propagate:  AND of all four I terms

  assign #1 p16_h = &i_h;
  assign #1 p16_l = &i_l;

  // Seed the two pseudo-carry chains with constants.
  // Remember that the true carry in to a 16 bit block = cin*Pm where
  // Pm is the propagate through the most significant bit of the
  // previous block.

  assign #1 cin0_h[0] = 1'b0;
  assign #1 cin0_l[0] = 1'b1;
  assign #1 cin1_h[0] = 1'b1;
  assign #1 cin1_l[0] = 1'b0;

endmodule

module adder16low(a_h, a_l, b_h, b_l, r_h, r_l,
                  c16_h, c16_l, p16_h, p16_l, longc_h, longc_l);

  // Low-order 16 bit adder block.
  //
  // Identical to adder16 except that the c16 equation carries an extra
  // I0 product term, which factors the carry in of the 64 bit adder
  // into the block carry.

  // Ports
  input  [16:0] a_h, a_l;         // operand A, really bits [15:-1]
  input  [16:0] b_h, b_l;         // operand B, really bits [15:-1]
  output [15:0] r_h, r_l;         // result
  output        c16_h, c16_l;     // carry produced by this 16 bit block
  output        p16_h, p16_l;     // propagate across this 16 bit block
  input         longc_h, longc_l; // carry in to this 16 bit block

  // Internal nets
  wire   [3:0]  h_h, h_l;         // H of each 4 bit sub-block
  wire   [3:0]  i_h, i_l;         // I of each 4 bit sub-block
  wire   [4:0]  cin0_h, cin0_l;   // pseudo-carry chain seeded with 0
  wire   [4:0]  cin1_h, cin1_l;   // pseudo-carry chain seeded with 1

  // 4 bit sub-blocks.  Sub-block n takes chain element [n] as its carry
  // in and drives element [n+1] as its carry out.  Element [4] of each
  // chain is driven by the last sub-block and is not read in this module.

  adder4 adder4_0(
      .a_h(a_h[4:0]),       .a_l(a_l[4:0]),
      .b_h(b_h[4:0]),       .b_l(b_l[4:0]),
      .r_h(r_h[3:0]),       .r_l(r_l[3:0]),
      .h4_h(h_h[0]),        .h4_l(h_l[0]),
      .i4_h(i_h[0]),        .i4_l(i_l[0]),
      .cin0_h(cin0_h[0]),   .cin0_l(cin0_l[0]),
      .cin1_h(cin1_h[0]),   .cin1_l(cin1_l[0]),
      .cout0_h(cin0_h[1]),  .cout0_l(cin0_l[1]),
      .cout1_h(cin1_h[1]),  .cout1_l(cin1_l[1]),
      .longc_h(longc_h),    .longc_l(longc_l));

  adder4 adder4_1(
      .a_h(a_h[8:4]),       .a_l(a_l[8:4]),
      .b_h(b_h[8:4]),       .b_l(b_l[8:4]),
      .r_h(r_h[7:4]),       .r_l(r_l[7:4]),
      .h4_h(h_h[1]),        .h4_l(h_l[1]),
      .i4_h(i_h[1]),        .i4_l(i_l[1]),
      .cin0_h(cin0_h[1]),   .cin0_l(cin0_l[1]),
      .cin1_h(cin1_h[1]),   .cin1_l(cin1_l[1]),
      .cout0_h(cin0_h[2]),  .cout0_l(cin0_l[2]),
      .cout1_h(cin1_h[2]),  .cout1_l(cin1_l[2]),
      .longc_h(longc_h),    .longc_l(longc_l));

  adder4 adder4_2(
      .a_h(a_h[12:8]),      .a_l(a_l[12:8]),
      .b_h(b_h[12:8]),      .b_l(b_l[12:8]),
      .r_h(r_h[11:8]),      .r_l(r_l[11:8]),
      .h4_h(h_h[2]),        .h4_l(h_l[2]),
      .i4_h(i_h[2]),        .i4_l(i_l[2]),
      .cin0_h(cin0_h[2]),   .cin0_l(cin0_l[2]),
      .cin1_h(cin1_h[2]),   .cin1_l(cin1_l[2]),
      .cout0_h(cin0_h[3]),  .cout0_l(cin0_l[3]),
      .cout1_h(cin1_h[3]),  .cout1_l(cin1_l[3]),
      .longc_h(longc_h),    .longc_l(longc_l));

  adder4 adder4_3(
      .a_h(a_h[16:12]),     .a_l(a_l[16:12]),
      .b_h(b_h[16:12]),     .b_l(b_l[16:12]),
      .r_h(r_h[15:12]),     .r_l(r_l[15:12]),
      .h4_h(h_h[3]),        .h4_l(h_l[3]),
      .i4_h(i_h[3]),        .i4_l(i_l[3]),
      .cin0_h(cin0_h[3]),   .cin0_l(cin0_l[3]),
      .cin1_h(cin1_h[3]),   .cin1_l(cin1_l[3]),
      .cout0_h(cin0_h[4]),  .cout0_l(cin0_l[4]),
      .cout1_h(cin1_h[4]),  .cout1_l(cin1_l[4]),
      .longc_h(longc_h),    .longc_l(longc_l));

  // 16 bit carry:  H3 + H2*I3 + H1*I2*I3 + (H0 + I0)*I1*I2*I3
  // The final product term (all four I's) is what distinguishes this
  // block from adder16.

  assign #1 c16_h = h_h[3] ||
                    (i_h[3] && h_h[2]) ||
                    (i_h[3] && i_h[2] && h_h[1]) ||
                    (i_h[3] && i_h[2] && i_h[1] && h_h[0]) ||
                    (i_h[3] && i_h[2] && i_h[1] && i_h[0]);
  assign #1 c16_l = h_l[3] ||
                    (i_l[3] && h_l[2]) ||
                    (i_l[3] && i_l[2] && h_l[1]) ||
                    (i_l[3] && i_l[2] && i_l[1] && h_l[0]) ||
                    (i_l[3] && i_l[2] && i_l[1] && i_l[0]);

  // 16 bit propagate:  AND of all four I terms

  assign #1 p16_h = &i_h;
  assign #1 p16_l = &i_l;

  // Seed the two pseudo-carry chains with constants.
  // Remember that the true carry in to a 16 bit block = cin*Pm where
  // Pm is the propagate through the most significant bit of the
  // previous block.

  assign #1 cin0_h[0] = 1'b0;
  assign #1 cin0_l[0] = 1'b1;
  assign #1 cin1_h[0] = 1'b1;
  assign #1 cin1_l[0] = 1'b0;

endmodule

module adder4(a_h, a_l, b_h, b_l, r_h, r_l,
              h4_h, h4_l, i4_h, i4_l, cin0_h, cin0_l, cin1_h, cin1_l,
              cout0_h, cout0_l, cout1_h, cout1_l, longc_h, longc_l);

  // 4 bit adder sub-block.
  //
  // Produces the pseudo-generate (H) and pseudo-propagate (I) of the
  // group, ripples the two short pseudo-carry chains (cin0 -> cout0 and
  // cin1 -> cout1) across the group, and computes the carry into each of
  // its four 1 bit cells for both chains.  The 1 bit cells make the
  // final choice between the two chains using longc.
  //
  // Operand index 0 is the bit below this group ("bit -1"); operand
  // indices 4:1 are the group's own bits 3:0.

  // Ports
  input  [4:0] a_h, a_l;          // operand A, really bits [3:-1]
  input  [4:0] b_h, b_l;          // operand B, really bits [3:-1]
  output [3:0] r_h, r_l;          // result
  output       h4_h, h4_l;        // pseudo-carry for bits 3:0
  output       i4_h, i4_l;        // pseudo-propagate (for bits 2:-1)
  input        cin0_h, cin0_l;    // pseudo-carry in, longc_l chain
  input        cin1_h, cin1_l;    // pseudo-carry in, longc_h chain
  output       cout0_h, cout0_l;  // pseudo-carry out, longc_l chain
  output       cout1_h, cout1_l;  // pseudo-carry out, longc_h chain
  input        longc_h, longc_l;  // carry in to the enclosing 16 bit block

  // Internal nets
  wire [3:0]   p, g, k;                 // P, G, K from the 1 bit cells
  wire [3:0]   carries0_h, carries0_l;  // per-bit carries, pseudo-carry in = 0
  wire [3:0]   carries1_h, carries1_l;  // per-bit carries, pseudo-carry in = 1
  wire [3:0]   c0_h, c0_l;              // carry into each cell, longc_l chain
  wire [3:0]   c1_h, c1_l;              // carry into each cell, longc_h chain

  // 1 bit cells.  Cell n handles operand index n+1.  p[3], g[3] and k[3]
  // are produced by the top cell but are not read in this module.

  adder1 adder1_0(
      .a_h(a_h[1]),      .a_l(a_l[1]),
      .b_h(b_h[1]),      .b_l(b_l[1]),
      .r_h(r_h[0]),      .r_l(r_l[0]),
      .longc_h(longc_h), .longc_l(longc_l),
      .c0_h(c0_h[0]),    .c0_l(c0_l[0]),
      .c1_h(c1_h[0]),    .c1_l(c1_l[0]),
      .p(p[0]), .g(g[0]), .k(k[0]));

  adder1 adder1_1(
      .a_h(a_h[2]),      .a_l(a_l[2]),
      .b_h(b_h[2]),      .b_l(b_l[2]),
      .r_h(r_h[1]),      .r_l(r_l[1]),
      .longc_h(longc_h), .longc_l(longc_l),
      .c0_h(c0_h[1]),    .c0_l(c0_l[1]),
      .c1_h(c1_h[1]),    .c1_l(c1_l[1]),
      .p(p[1]), .g(g[1]), .k(k[1]));

  adder1 adder1_2(
      .a_h(a_h[3]),      .a_l(a_l[3]),
      .b_h(b_h[3]),      .b_l(b_l[3]),
      .r_h(r_h[2]),      .r_l(r_l[2]),
      .longc_h(longc_h), .longc_l(longc_l),
      .c0_h(c0_h[2]),    .c0_l(c0_l[2]),
      .c1_h(c1_h[2]),    .c1_l(c1_l[2]),
      .p(p[2]), .g(g[2]), .k(k[2]));

  adder1 adder1_3(
      .a_h(a_h[4]),      .a_l(a_l[4]),
      .b_h(b_h[4]),      .b_l(b_l[4]),
      .r_h(r_h[3]),      .r_l(r_l[3]),
      .longc_h(longc_h), .longc_l(longc_l),
      .c0_h(c0_h[3]),    .c0_l(c0_l[3]),
      .c1_h(c1_h[3]),    .c1_l(c1_l[3]),
      .p(p[3]), .g(g[3]), .k(k[3]));

  // Pseudo-generate and pseudo-propagate.
  // H covers bits [3:0] while I covers bits [2:-1].  The offset looks
  // odd at first but is what makes the carry chain work out.
  //
  // H = G3 + G2 + P2*(G1 + P1*G0), where Gn is an AND and Pn is an OR
  // of the two operand bits (operand index n+1).

  assign #1 h4_h = (a_h[4] && b_h[4]) ||
                   (a_h[3] && b_h[3]) ||
                   ((a_h[3] || b_h[3]) &&
                    ((a_h[2] && b_h[2]) ||
                     ((a_h[2] || b_h[2]) && a_h[1] && b_h[1])));
  assign #1 h4_l = (a_l[4] && b_l[4]) ||
                   (a_l[3] && b_l[3]) ||
                   ((a_l[3] || b_l[3]) &&
                    ((a_l[2] && b_l[2]) ||
                     ((a_l[2] || b_l[2]) && a_l[1] && b_l[1])));

  // I = P-1 * P0 * P1 * P2 (operand indices 0..3)

  assign #1 i4_h = (a_h[0] || b_h[0]) && (a_h[1] || b_h[1]) &&
                   (a_h[2] || b_h[2]) && (a_h[3] || b_h[3]);
  assign #1 i4_l = (a_l[0] || b_l[0]) && (a_l[1] || b_l[1]) &&
                   (a_l[2] || b_l[2]) && (a_l[3] || b_l[3]);

  // Ripple the short pseudo-carries across this 4 bit group:
  // cout = H + I*cin, for each chain and each rail.

  assign #1 cout0_h = h4_h || (i4_h && cin0_h);
  assign #1 cout0_l = h4_l || (i4_l && cin0_l);
  assign #1 cout1_h = h4_h || (i4_h && cin1_h);
  assign #1 cout1_l = h4_l || (i4_l && cin1_l);

  // Speculative carry into each 1 bit cell, for a pseudo-carry of 0 or
  // of 1 into the group.  In the circuit these are four 4 bit
  // Manchester carry chains (one per rail per assumption), so the delay
  // from p and g to carries1_h[3] is that of one complex domino gate
  // rather than four gates.
  //
  // The _h chains are seeded with 0 (pseudo-carry 0) or with the
  // _h propagate of bit -1 (pseudo-carry 1) and ripple with g.
  // The _l chains are seeded with the _l propagate of bit -1
  // (pseudo-carry 0) or with 0 (pseudo-carry 1) and ripple with k.

  assign #1 carries0_h[0] = 1'b0;
  assign #1 carries0_h[1] = g[0] || (p[0] && carries0_h[0]);
  assign #1 carries0_h[2] = g[1] || (p[1] && carries0_h[1]);
  assign #1 carries0_h[3] = g[2] || (p[2] && carries0_h[2]);

  assign #1 carries1_h[0] = b_h[0] || a_h[0];
  assign #1 carries1_h[1] = g[0] || (p[0] && carries1_h[0]);
  assign #1 carries1_h[2] = g[1] || (p[1] && carries1_h[1]);
  assign #1 carries1_h[3] = g[2] || (p[2] && carries1_h[2]);

  assign #1 carries0_l[0] = b_l[0] || a_l[0];
  assign #1 carries0_l[1] = k[0] || (p[0] && carries0_l[0]);
  assign #1 carries0_l[2] = k[1] || (p[1] && carries0_l[1]);
  assign #1 carries0_l[3] = k[2] || (p[2] && carries0_l[2]);

  assign #1 carries1_l[0] = 1'b0;
  assign #1 carries1_l[1] = k[0] || (p[0] && carries1_l[0]);
  assign #1 carries1_l[2] = k[1] || (p[1] && carries1_l[1]);
  assign #1 carries1_l[3] = k[2] || (p[2] && carries1_l[2]);

  // Once the pseudo-carry into the group arrives, it selects which set
  // of speculative carries goes to the 1 bit cells:  the _h rail of the
  // pseudo-carry picks the carries1 set, the _l rail picks carries0.

  assign #1 c0_h = (carries0_h & {4{cin0_l}}) | (carries1_h & {4{cin0_h}});
  assign #1 c0_l = (carries0_l & {4{cin0_l}}) | (carries1_l & {4{cin0_h}});
  assign #1 c1_h = (carries0_h & {4{cin1_l}}) | (carries1_h & {4{cin1_h}});
  assign #1 c1_l = (carries0_l & {4{cin1_l}}) | (carries1_l & {4{cin1_h}});

endmodule

module adder1(a_h, a_l, b_h, b_l, r_h, r_l,
              longc_h, longc_l, c0_h, c0_l, c1_h, c1_l, p, g, k);

  // 1 bit adder cell.
  //
  // Produces the bit's generate / propagate / kill terms and the two
  // rails of the sum bit.  Two candidate carries arrive (c0 and c1);
  // longc picks which one is used:  longc_h selects c1, longc_l
  // selects c0.

  // Ports
  input         a_h, a_l;         // operand A bit
  input         b_h, b_l;         // operand B bit
  output        r_h, r_l;         // sum bit
  input         longc_h, longc_l; // carry in to the 16 bit block
  input         c0_h, c0_l;       // carry in to this bit when longc_l = 1
  input         c1_h, c1_l;       // carry in to this bit when longc_h = 1
  output        p, g, k;          // propagate, generate, kill

  // Single bit generate, propagate and kill

  assign #1 g = b_h && a_h;                     // generate  = A*B
  assign #1 k = b_l && a_l;                     // kill      = !A*!B
  assign #1 p = (b_h && a_l) || (b_l && a_h);   // propagate = A XOR B
                                                //  (not A OR B)

  // True and complementary result (Figure 3).
  // The sum is 1 when the operand bits agree (g or k) and the selected
  // carry is 1, or when they differ (p) and the selected carry is 0;
  // r_l is the same expression with the carry rails swapped.

  assign #1 r_h = ((g || k) && ((longc_h && c1_h) || (longc_l && c0_h))) ||
                  (p        && ((longc_h && c1_l) || (longc_l && c0_l)));
  assign #1 r_l = (p        && ((longc_h && c1_h) || (longc_l && c0_h))) ||
                  ((g || k) && ((longc_h && c1_l) || (longc_l && c0_l)));

endmodule

///////////
// clock //
///////////
module clock(clk);

  // Free-running clock generator.
  // clk starts high at time 0 and toggles every 33 time units, giving a
  // period of 66 (in units of FO4 inverter delays).

  // Interface
  output clk;

  // clk is driven procedurally, so it is a reg
  reg    clk;

  initial
    begin
      clk = 1;              // first rising edge at time 0
      forever
        begin
          #33 clk = 0;      // high phase: 33 units
          #33 clk = 1;      // low phase:  33 units
        end
    end

endmodule

//////////////
// stimulus //
//////////////
module stimulus(a, b, cin, clk);

  // Random stimulus generator for the 64 bit adder.
  // Drives a, b and cin to 0 at time 0, then applies a fresh random
  // vector one time unit after every rising edge of clk.

  // Interface
  output [63:0] a;      // operand A
  output [63:0] b;      // operand B
  output        cin;    // carry in
  input         clk;

  // Output state
  reg    [63:0] a;
  reg    [63:0] b;
  reg           cin;

  // Start from a known all-zero vector
  initial
    begin
      a   = 0;
      b   = 0;
      cin = 0;
    end

  // New random vector each cycle.  $random returns 32 bits, so each
  // 64 bit operand is filled in two halves; cin keeps only the low bit
  // of its draw.  The five calls are kept as separate statements in a
  // fixed order so the random sequence seen by each field is stable.
  always @(posedge clk)
    begin
      a[31:0]  <= #1 $random;
      a[63:32] <= #1 $random;
      b[31:0]  <= #1 $random;
      b[63:32] <= #1 $random;
      cin      <= #1 $random;
    end

endmodule
 
/////////////
// checker //
/////////////
module checker(a, b, cin, r, clk);

  // Self-checking monitor for the 64 bit adder.
  // On every falling edge of clk it compares the adder result r against
  // a + b + cin (truncated to 64 bits), reports and stops on a mismatch,
  // and ends the simulation after 1000000 checks.

  // Interface
  input  [63:0] a;      // operand A applied to the adder
  input  [63:0] b;      // operand B applied to the adder
  input         cin;    // carry in applied to the adder
  input  [63:0] r;      // adder result under test (only read here, so it
                        //  is an input; it is never driven by the checker)
  input         clk;

  // Internal checker state
  reg    [31:0] checks;     // number of comparisons performed
  reg    [31:0] failures;   // number of mismatches seen

  // Reference result, 64 bits wide (carry out is discarded)
  wire   [63:0] expected = a + b + cin;

  initial
    begin
      checks <= #1 0;
      failures <= #1 0;
      $display("Beginning random stimulus testing of 64 bit adder");
    end

  always @(negedge clk)
    begin

      // Check and report errors.  Case inequality (!==) is used so that
      // a result containing X or Z bits counts as a failure; with !=
      // such a comparison evaluates to X and the error would be skipped.
      if (expected !== r)
        begin
          $display("\n*** Error ***  %h + %h + %b -> %h\n  (should be %h)",
                    a, b, cin, r, expected);
          failures = failures + 1;
          $stop;
        end

      // Track how many checks have been completed
      checks <= #1 checks+1;
      if (checks % 100 == 0 && checks > 0)
        $display("%d checks completed; %d failures found\n",checks,failures);

//      $display ("A = %h B = %h cin = %b r= %h", a, b, cin, r);

      // Finish after a million vectors
      if (checks == 1000000) begin
        if (failures == 0) $display("Adder Passes Random Testing.");
        else $display("Adder Fails Random Testing.");
        $finish;
      end

    end

  // Display the results graphically
  //initial
  //    $gr_waves("A", a, "B" ,b , "cin", 1, "R", r);

endmodule