/////////////////////////////////////////////////////// // ling.v // // // // Written 10/16/96 by David Harris harrisd@leland // // EE371 Problem Set 4 // // // // Corrected 10/23/96: fixed C16 logic // // Updated 11/8/96: changed for symmetric _h, _l // // // // This file models a 64 bit Ling adder based on // // the 1996 ISSCC paper (SP22.5) by S. Naffziger. // // // /////////////////////////////////////////////////////// // This Verilog file contains a model of the 64 bit Ling adder // as best as could be constructed from the ISSCC paper. It // also contains instrumentation to generate random test vectors // and check the results. If the adder passes a million // random tests, it probably is logically correct (or at least has // a pretty subtle bug!) // // The paper is pretty good, but, like most papers, has some // errors and some important omissions. As far as I can tell: // // The "Short Carry Generate" circuit in the slide supplement // disagrees with Figure 2 from the Digest of Technical Papers. // I believe the slides are correct (there is no G0 term). // // C16 is written as (H0+I0)*I1*I2*I3 + H1*I2*I3 + H2*I3 + H3 // in the Digest, but as H3 + H2*I3 + H1*I2*I3 + H0*I1*I2*I3 // in the slide supplement. Both are wrong and both are correct. // The I0 term is required for the low-order 16 bit block to // incorporate the adder's cin term. The I0 term is left out // of the other 16 bit blocks. // // Another key point mentioned in correspondence with the author // but only cryptically in the paper is that I4 is not P0*P1*P2*P3 as one might // expect, but rather P-1*P0*P1*P2, where P-1 is the propagate // from the previous block of 4 (or the carry in to the very first // block). This is nonobvious, but causes the carry chain to work // out nicely. // // Finally, there is a P term missing from the equations for true // carries in. For example, Long_C (the carry in to a 16 bit block) // is defined for the high 16 bits to be C48 + C32*P48 + C16*P32*P48. 
// The real carry in to the high 16 bits is Long_C*P47, where P47 is // A47 + B47. This propagate is factored into the short carry chain // instead. // // This model doesn't strictly match the ISSCC paper for the "short carry // ripple" logic. The model propagates 4-bit pseudo-carries through three // domino gates, then trips a mux which selects the 1-bit real carries // which have been computed assuming pseudo-carry of 0 or 1 into the // 4 bit block. Sam somehow propagates real 4 bit carries using his C3 // term; I am not entirely clear on the logic. // // All of the circuits are implemented with domino. Instead of using // "true" dual rail domino where _h and _l signals are complementary, // _h and _l are redefined to mean "applies to a 1" and "applies to a 0". // For example I_h means the propagate for a 1, while I_l means the // propagate for a 0. H_h means that a 1 is (pseudo)generated, while H_l // means that a 0 is pseudogenerated, not that a 1 is not generated. // The adder logic has interesting self-dual properties which make everything // work out so that the carries are produced properly. Moreover, the // same kind of gate can now be used for _h and _l which makes layout and // circuit design simpler. 
//

///////////////////////////////
// Top Level Instrumentation //
///////////////////////////////

module top();
  // Test harness: wires the 64 bit adder to a clock generator, a random
  // stimulus generator and a result checker.  The adder takes dual-rail
  // inputs, so the _l rails are driven with the bitwise complement of the
  // _h rails.

  // Input and output busses
  wire [63:0] a;
  wire [63:0] b;
  wire        cin;
  wire [63:0] r_h, r_l;
  wire        clk;

  // Instantiate adder
  adder64 adder(a, ~a, b, ~b, cin, ~cin, r_h, r_l);

  // Clock generator
  clock clockgen(clk);

  // Instrumentation to test adder (only the r_h rail is checked)
  stimulus stimgen(a, b, cin, clk);
  checker  stimchk(a, b, cin, r_h, clk);

endmodule

/////////////
// adder64 //
/////////////

module adder64(a_h, a_l, b_h, b_l, cin_h, cin_l, r_h, r_l);
  // A 64 bit ling adder based on Naffziger, ISSCC 96
  //
  // Built from four 16 bit blocks.  Each block also receives the most
  // significant operand bit of the block below it as its "bit -1"
  // (hence the 17 bit slices such as a_h[31:15]).

  // Interface (neglect fact adder is clocked)
  input  [63:0] a_h, a_l;
  input  [63:0] b_h, b_l;
  input         cin_h, cin_l;
  output [63:0] r_h, r_l;

  // Internal signals
  wire p16_h, p32_h, p48_h, p64_h;  // propagate out of 16 bit blocks
  wire p16_l, p32_l, p48_l, p64_l;
  wire c16_h, c32_h, c48_h, c64_h;  // carries out of 16 bit blocks
  wire c16_l, c32_l, c48_l, c64_l;
  wire long_c16_h, long_c32_h, long_c48_h;  // carries into 16 bit blks
  wire long_c16_l, long_c32_l, long_c48_l;  // carries into 16 bit blks

  // Sub-blocks
  // The low-order 16 bit block is slightly different because its c16 logic
  // factors in the cin term.  Its bit -1 operands are driven with cin on
  // both a and b, and cin is also used as its long carry in.
  adder16low adder16_0({a_h[15:0], cin_h}, {a_l[15:0], cin_l},
                       {b_h[15:0], cin_h}, {b_l[15:0], cin_l},
                       r_h[15:0], r_l[15:0],
                       c16_h, c16_l, p16_h, p16_l,
                       cin_h, cin_l);
  adder16 adder16_1(a_h[31:15], a_l[31:15],
                    b_h[31:15], b_l[31:15],
                    r_h[31:16], r_l[31:16],
                    c32_h, c32_l, p32_h, p32_l,
                    long_c16_h, long_c16_l);
  adder16 adder16_2(a_h[47:31], a_l[47:31],
                    b_h[47:31], b_l[47:31],
                    r_h[47:32], r_l[47:32],
                    c48_h, c48_l, p48_h, p48_l,
                    long_c32_h, long_c32_l);
  adder16 adder16_3(a_h[63:47], a_l[63:47],
                    b_h[63:47], b_l[63:47],
                    r_h[63:48], r_l[63:48],
                    c64_h, c64_l, p64_h, p64_l,
                    long_c48_h, long_c48_l);

  // Compute long carries into 16 bit blocks
  assign #1 long_c48_h = c48_h || p48_h && (c32_h || p32_h && c16_h);
  assign #1 long_c32_h = c32_h || p32_h && c16_h;
  assign #1 long_c16_h = c16_h;

  assign #1 long_c48_l = c48_l || p48_l && (c32_l || p32_l && c16_l);
  assign #1 long_c32_l = c32_l || p32_l && c16_l;
  assign #1 long_c16_l = c16_l;

endmodule

module adder16(a_h, a_l, b_h, b_l, r_h, r_l,
               c16_h, c16_l, p16_h, p16_l, longc_h, longc_l);
  // 16 bit block of the Ling adder: four 4 bit sub-blocks plus the logic
  // producing the block's pseudo-carry (c16) and propagate (p16).

  // Interface
  input  [16:0] a_h, a_l;      // really bits [15:-1]
  input  [16:0] b_h, b_l;      // really bits [15:-1]
  output [15:0] r_h, r_l;
  output        c16_h, c16_l;  // carry produced by 16 bit block
  output        p16_h, p16_l;  // 16 bit propagate
  input         longc_h, longc_l;  // carry in to 16 bit block

  // Internal signals
  wire [3:0] h_h, h_l;         // H of each 4 bit subblock
  wire [3:0] i_h, i_l;         // I of each 4 bit subblock
  wire [4:0] cin0_h, cin0_l;   // Carry in to 4 bit block if longc_l = 1
  wire [4:0] cin1_h, cin1_l;   // Carry in to 4 bit block if longc_h = 1

  // Sub-blocks
  adder4 adder4_0(a_h[4:0], a_l[4:0], b_h[4:0], b_l[4:0],
                  r_h[3:0], r_l[3:0],
                  h_h[0], h_l[0], i_h[0], i_l[0],
                  cin0_h[0], cin0_l[0], cin1_h[0], cin1_l[0],
                  cin0_h[1], cin0_l[1], cin1_h[1], cin1_l[1],
                  longc_h, longc_l);
  adder4 adder4_1(a_h[8:4], a_l[8:4], b_h[8:4], b_l[8:4],
                  r_h[7:4], r_l[7:4],
                  h_h[1], h_l[1], i_h[1], i_l[1],
                  cin0_h[1], cin0_l[1], cin1_h[1], cin1_l[1],
                  cin0_h[2], cin0_l[2], cin1_h[2], cin1_l[2],
                  longc_h, longc_l);
  adder4 adder4_2(a_h[12:8], a_l[12:8], b_h[12:8], b_l[12:8],
                  r_h[11:8], r_l[11:8],
                  h_h[2], h_l[2], i_h[2], i_l[2],
                  cin0_h[2], cin0_l[2], cin1_h[2], cin1_l[2],
                  cin0_h[3], cin0_l[3], cin1_h[3], cin1_l[3],
                  longc_h, longc_l);
  adder4 adder4_3(a_h[16:12], a_l[16:12], b_h[16:12], b_l[16:12],
                  r_h[15:12], r_l[15:12],
                  h_h[3], h_l[3], i_h[3], i_l[3],
                  cin0_h[3], cin0_l[3], cin1_h[3], cin1_l[3],
                  cin0_h[4], cin0_l[4], cin1_h[4], cin1_l[4],
                  longc_h, longc_l);

  // Compute 16 bit C and P
  assign #1 c16_h = h_h[3] || i_h[3] && (h_h[2] || i_h[2] &&
                    (h_h[1] || i_h[1] && h_h[0]));
  assign #1 c16_l = h_l[3] || i_l[3] && (h_l[2] || i_l[2] &&
                    (h_l[1] || i_l[1] && h_l[0]));
  assign #1 p16_h = i_h[3] && i_h[2] && i_h[1] && i_h[0];
  assign #1 p16_l = i_l[3] && i_l[2] && i_l[1] && i_l[0];

  // Set cin to low-order 4 bits
  // Remember that the true carry in to a 16 bit block = cin*Pm where
  // Pm is the propagate through the most significant bit of the
  // previous block.
  assign #1 cin0_h[0] = 1'b0;
  assign #1 cin1_h[0] = 1'b1;
  assign #1 cin0_l[0] = 1'b1;
  assign #1 cin1_l[0] = 1'b0;

endmodule

module adder16low(a_h, a_l, b_h, b_l, r_h, r_l,
                  c16_h, c16_l, p16_h, p16_l, longc_h, longc_l);
  // This block is identical to adder16 except the c16 logic
  // is modified to factor in the cin to the 64 bit adder
  // (the innermost term is h[0] || i[0] rather than just h[0]).

  // Interface
  input  [16:0] a_h, a_l;      // really bits [15:-1]
  input  [16:0] b_h, b_l;      // really bits [15:-1]
  output [15:0] r_h, r_l;
  output        c16_h, c16_l;  // carry produced by 16 bit block
  output        p16_h, p16_l;  // 16 bit propagate
  input         longc_h, longc_l;  // carry in to 16 bit block

  // Internal signals
  wire [3:0] h_h, h_l;         // H of each 4 bit subblock
  wire [3:0] i_h, i_l;         // I of each 4 bit subblock
  wire [4:0] cin0_h, cin0_l;   // Carry in to 4 bit block if longc_l = 1
  wire [4:0] cin1_h, cin1_l;   // Carry in to 4 bit block if longc_h = 1

  // Sub-blocks
  adder4 adder4_0(a_h[4:0], a_l[4:0], b_h[4:0], b_l[4:0],
                  r_h[3:0], r_l[3:0],
                  h_h[0], h_l[0], i_h[0], i_l[0],
                  cin0_h[0], cin0_l[0], cin1_h[0], cin1_l[0],
                  cin0_h[1], cin0_l[1], cin1_h[1], cin1_l[1],
                  longc_h, longc_l);
  adder4 adder4_1(a_h[8:4], a_l[8:4], b_h[8:4], b_l[8:4],
                  r_h[7:4], r_l[7:4],
                  h_h[1], h_l[1], i_h[1], i_l[1],
                  cin0_h[1], cin0_l[1], cin1_h[1], cin1_l[1],
                  cin0_h[2], cin0_l[2], cin1_h[2], cin1_l[2],
                  longc_h, longc_l);
  adder4 adder4_2(a_h[12:8], a_l[12:8], b_h[12:8], b_l[12:8],
                  r_h[11:8], r_l[11:8],
                  h_h[2], h_l[2], i_h[2], i_l[2],
                  cin0_h[2], cin0_l[2], cin1_h[2], cin1_l[2],
                  cin0_h[3], cin0_l[3], cin1_h[3], cin1_l[3],
                  longc_h, longc_l);
  adder4 adder4_3(a_h[16:12], a_l[16:12], b_h[16:12], b_l[16:12],
                  r_h[15:12], r_l[15:12],
                  h_h[3], h_l[3], i_h[3], i_l[3],
                  cin0_h[3], cin0_l[3], cin1_h[3], cin1_l[3],
                  cin0_h[4], cin0_l[4], cin1_h[4], cin1_l[4],
                  longc_h, longc_l);

  // Compute 16 bit C and P
  assign #1 c16_h = h_h[3] || i_h[3] && (h_h[2] || i_h[2] &&
                    (h_h[1] || i_h[1] && (h_h[0] || i_h[0])));
  assign #1 c16_l = h_l[3] || i_l[3] && (h_l[2] || i_l[2] &&
                    (h_l[1] || i_l[1] && (h_l[0] || i_l[0])));
  assign #1 p16_h = i_h[3] && i_h[2] && i_h[1] && i_h[0];
  assign #1 p16_l = i_l[3] && i_l[2] && i_l[1] && i_l[0];

  // Set cin to low-order 4 bits
  // Remember that the true carry in to a 16 bit block = cin*Pm where
  // Pm is the propagate through the most significant bit of the
  // previous block.
  assign #1 cin0_h[0] = 1'b0;
  assign #1 cin1_h[0] = 1'b1;
  assign #1 cin0_l[0] = 1'b1;
  assign #1 cin1_l[0] = 1'b0;

endmodule

module adder4(a_h, a_l, b_h, b_l, r_h, r_l, h4_h, h4_l, i4_h, i4_l,
              cin0_h, cin0_l, cin1_h, cin1_l,
              cout0_h, cout0_l, cout1_h, cout1_l,
              longc_h, longc_l);
  // 4 bit block: four 1 bit adders, the block's pseudo-generate (H) and
  // pseudo-propagate (I), the rippled pseudo-carry out, and the
  // speculative real carries into each bit.

  // Interface
  input  [4:0] a_h, a_l;       // really bits [3:-1]
  input  [4:0] b_h, b_l;       // really bits [3:-1]
  output [3:0] r_h, r_l;
  output       h4_h, h4_l;     // pseudo-carry for bits 3:0
  output       i4_h, i4_l;     // pseudo-propagate (for bits 2:-1)
  input        cin0_h, cin0_l; // carry in to 4-bit block given longc_l
  input        cin1_h, cin1_l; // carry in to 4-bit block given longc_h
  output       cout0_h, cout0_l; // carry out of 4-bit block given longc_l
  output       cout1_h, cout1_l; // carry out of 4-bit block given longc_h
  input        longc_h, longc_l; // carry in to 16 bit block

  // Internal signals
  wire [3:0] c0_h, c0_l;       // carry in to 1-bit block given longc_l
  wire [3:0] c1_h, c1_l;       // carry in to 1-bit block given longc_h
  wire [3:0] carries0_h, carries0_l; // speculative 1 bit carries, pseudo-carry in = 0
  wire [3:0] carries1_h, carries1_l; // speculative 1 bit carries, pseudo-carry in = 1
  wire [3:0] p, g, k;          // P, G, K from 1 bit blocks

  // Sub-blocks (operand index n+1 holds real bit n, because index 0 is bit -1)
  adder1 adder1_0(a_h[1], a_l[1], b_h[1], b_l[1], r_h[0], r_l[0],
                  longc_h, longc_l,
                  c0_h[0], c0_l[0], c1_h[0], c1_l[0],
                  p[0], g[0], k[0]);
  adder1 adder1_1(a_h[2], a_l[2], b_h[2], b_l[2], r_h[1], r_l[1],
                  longc_h, longc_l,
                  c0_h[1], c0_l[1], c1_h[1], c1_l[1],
                  p[1], g[1], k[1]);
  adder1 adder1_2(a_h[3], a_l[3], b_h[3], b_l[3], r_h[2], r_l[2],
                  longc_h, longc_l,
                  c0_h[2], c0_l[2], c1_h[2], c1_l[2],
                  p[2], g[2], k[2]);
  adder1 adder1_3(a_h[4], a_l[4], b_h[4], b_l[4], r_h[3], r_l[3],
                  longc_h, longc_l,
                  c0_h[3], c0_l[3], c1_h[3], c1_l[3],
                  p[3], g[3], k[3]);

  // Compute pseudo-generate & propagate
  // Note: the H term is for bits [3:0], while the
  // I term is for bits [2:-1].  This seems funny
  // at first, but is helpful for carry propagation.
  assign #1 h4_h = a_h[4] && b_h[4] ||
                   a_h[3] && b_h[3] ||
                   a_h[2] && b_h[2] && (a_h[3] || b_h[3]) ||
                   a_h[1] && b_h[1] && (a_h[2] || b_h[2]) && (a_h[3] || b_h[3]);
  assign #1 h4_l = a_l[4] && b_l[4] ||
                   a_l[3] && b_l[3] ||
                   a_l[2] && b_l[2] && (a_l[3] || b_l[3]) ||
                   a_l[1] && b_l[1] && (a_l[2] || b_l[2]) && (a_l[3] || b_l[3]);
  assign #1 i4_h = (a_h[3] || b_h[3]) && (a_h[2] || b_h[2]) &&
                   (a_h[1] || b_h[1]) && (a_h[0] || b_h[0]);
  assign #1 i4_l = (a_l[3] || b_l[3]) && (a_l[2] || b_l[2]) &&
                   (a_l[1] || b_l[1]) && (a_l[0] || b_l[0]);

  // Ripple short pseudo carry through 4 bit block
  assign #1 cout0_h = cin0_h && i4_h || h4_h;
  assign #1 cout0_l = cin0_l && i4_l || h4_l;
  assign #1 cout1_h = cin1_h && i4_h || h4_h;
  assign #1 cout1_l = cin1_l && i4_l || h4_l;

  // Compute real carry in to each 1 bit block assuming 0 or 1
  // pseudo carry into 4 bit block.
  // This uses four 4-bit Manchester carry chains, one for each of
  // carry_h and carry_l assuming cin of 0 and 1.  Therefore, the delay
  // from p and g to carries1_h[3] is one Manchester carry chain (essentially
  // a complex domino gate), rather than through four gates.
  //
  // carries0_h[b]: carry into bit b assuming pseudo-carry of 0 into 4 block
  // carries1_h[b]: carry into bit b assuming pseudo-carry of 1 into 4 block
  // carries0_l[b]: ~carry into bit b assuming pseudo-carry of 0 into 4 block
  // carries1_l[b]: ~carry into bit b assuming pseudo-carry of 1 into 4 block
  assign #1 carries0_h[0] = 1'b0;
  assign #1 carries0_h[1] = carries0_h[0] && p[0] || g[0];
  assign #1 carries0_h[2] = carries0_h[1] && p[1] || g[1];
  assign #1 carries0_h[3] = carries0_h[2] && p[2] || g[2];

  // A pseudo-carry of 1 only becomes a real carry if bit -1 propagates.
  assign #1 carries1_h[0] = a_h[0] || b_h[0];
  assign #1 carries1_h[1] = carries1_h[0] && p[0] || g[0];
  assign #1 carries1_h[2] = carries1_h[1] && p[1] || g[1];
  assign #1 carries1_h[3] = carries1_h[2] && p[2] || g[2];

  assign #1 carries0_l[0] = a_l[0] || b_l[0];
  assign #1 carries0_l[1] = carries0_l[0] && p[0] || k[0];
  assign #1 carries0_l[2] = carries0_l[1] && p[1] || k[1];
  assign #1 carries0_l[3] = carries0_l[2] && p[2] || k[2];

  assign #1 carries1_l[0] = 1'b0;
  assign #1 carries1_l[1] = carries1_l[0] && p[0] || k[0];
  assign #1 carries1_l[2] = carries1_l[1] && p[1] || k[1];
  assign #1 carries1_l[3] = carries1_l[2] && p[2] || k[2];

  // Select real carry into 1 bit blocks when pseudo carry into 4 bit block
  // arrives
  assign #1 c0_h = {4{cin0_h}} & carries1_h | {4{cin0_l}} & carries0_h;
  assign #1 c0_l = {4{cin0_h}} & carries1_l | {4{cin0_l}} & carries0_l;
  assign #1 c1_h = {4{cin1_h}} & carries1_h | {4{cin1_l}} & carries0_h;
  assign #1 c1_l = {4{cin1_h}} & carries1_l | {4{cin1_l}} & carries0_l;

endmodule

module adder1(a_h, a_l, b_h, b_l, r_h, r_l, longc_h, longc_l,
              c0_h, c0_l, c1_h, c1_l, p, g, k);
  // 1 bit block: produces P/G/K for the bit and the dual-rail sum, chosen
  // between the two speculative carries by the long carry.

  // Interface
  input  a_h, a_l;
  input  b_h, b_l;
  output r_h, r_l;
  input  longc_h, longc_l;     // carry in to 16 bit block
  input  c0_h, c0_l;           // carry in to 1 bit block when longc_l = 1
  input  c1_h, c1_l;           // carry in to 1 bit block when longc_h = 1
  output p, g, k;              // propagate, generate, kill

  // Compute generate and propagate and kill for single bit
  assign #1 g = a_h && b_h;                 // generate = AB
  assign #1 p = a_h && b_l || a_l && b_h;   // propagate = A XOR B (not A OR B)
  assign #1 k = a_l && b_l;                 // kill = !A*!B

  // Compute true and complementary result (Figure 3)
  // (g || k) is the XNOR of the operands, p is the XOR.
  assign #1 r_h = (longc_h && (c1_h && (g || k) || c1_l && p)) ||
                  (longc_l && (c0_h && (g || k) || c0_l && p));
  assign #1 r_l = (longc_h && (c1_h && p || c1_l && (g || k))) ||
                  (longc_l && (c0_h && p || c0_l && (g || k)));

endmodule

///////////
// clock //
///////////

module clock(clk);
  // Free-running clock source for the testbench.

  // Interface
  output clk;

  // Internal clk signal
  reg clk;

  // Generate Clock with period = 66 FO4 inverter delays
  initial
    forever begin
      clk = 1; #33;
      clk = 0; #33;
    end

endmodule

//////////////
// stimulus //
//////////////

module stimulus(a, b, cin, clk);
  // Drives new random operands and carry in on every rising clock edge.

  // Interface
  output [63:0] a;
  output [63:0] b;
  output        cin;
  input         clk;

  // Internal state nodes
  reg [63:0] a;
  reg [63:0] b;
  reg        cin;
  reg [31:0] count;  // only used by the disabled warm-up loop below
  reg [31:0] tmp;    // only used by the disabled warm-up loop below

  // Apply stimulus to input and verify output
  initial begin
    a = 0;
    b = 0;
    cin = 0;
    /*
    count = 0;
    while (count < 100000) begin
      tmp[31:0] = $random;
      tmp[31:0] = $random;
      tmp[31:0] = $random;
      tmp[31:0] = $random;
      tmp[0] = $random;
      count = count+1;
      if (count % 2000 == 0) $display(count);
    end
    $display("done initializing");
    */
  end

  // $random is called once per 32 bit half of each operand.
  always @(posedge clk) begin
    a[31:0]  <= #1 $random;
    a[63:32] <= #1 $random;
    b[31:0]  <= #1 $random;
    b[63:32] <= #1 $random;
    cin      <= #1 $random;
  end

endmodule

/////////////
// checker //
/////////////

module checker(a, b, cin, r, clk);
  // Compares the adder result r against a + b + cin on every falling
  // clock edge, counts checks and failures, and ends the simulation
  // after 1000000 checks.

  // Interface
  input [63:0] a;
  input [63:0] b;
  input        cin;
  input [63:0] r;    // result under test; only read here, so it is an input
  input        clk;

  // Internal checker state
  reg [31:0] checks;
  reg [31:0] failures;

  initial begin
    checks   <= #1 0;
    failures <= #1 0;
    $display("Beginning random stimulus testing of 64 bit adder");
  end

  always @(negedge clk) begin
    // Check and report errors.
    // Case inequality (!==) is used so that an X or Z bit in r is reported
    // as a failure; with != the comparison would evaluate to X and the
    // if would silently treat it as "no error".
    if ((a + b + cin) !== r) begin
      $display("\n*** Error *** %h + %h + %b -> %h\n (should be %h)",
               a, b, cin, r, a+b+cin);
      failures = failures + 1;
      $stop;
    end

    // track how many checks have been completed
    checks <= #1 checks+1;
    if (checks % 100 == 0 && checks > 0)
      $display("%d checks completed; %d failures found\n",checks,failures);
    // $display ("A = %h B = %h cin = %b r= %h", a, b, cin, r);
    if (checks == 1000000) begin
      if (failures == 0) $display("Adder Passes Random Testing.");
      else $display("Adder Fails Random Testing.");
      $finish;
    end
  end

  // Display the results graphically
  //initial
  //  $gr_waves("A", a, "B" ,b , "cin", 1, "R", r);

endmodule