/*************************************************************************
 *
 * This file is part of ACT dataflow neuro library
 *
 * Copyright (c) 2022 University of Groningen - Ole Richter
 * Copyright (c) 2022 University of Groningen - Michele Mastella
 * Copyright (c) 2022 University of Groningen - Hugh Greatorex
 * Copyright (c) 2022 University of Groningen - Madison Cotteret
 *
 *
 * This source describes Open Hardware and is licensed under the CERN-OHL-W v2 or later
 *
 * You may redistribute and modify this documentation and make products
 * using it under the terms of the CERN-OHL-W v2 (https:/cern.ch/cern-ohl).
 * This documentation is distributed WITHOUT ANY EXPRESS OR IMPLIED
 * WARRANTY, INCLUDING OF MERCHANTABILITY, SATISFACTORY QUALITY
 * AND FITNESS FOR A PARTICULAR PURPOSE. Please see the CERN-OHL-W v2
 * for applicable conditions.
 *
 * Source location: https://git.web.rug.nl/bics/actlib_dataflow_neuro
 *
 * As per CERN-OHL-W v2 section 4.1, should You produce hardware based on
 * these sources, You must maintain the Source Location visible in its
 * documentation.
 *
 **************************************************************************
 */

import "../../dataflow_neuro/cell_lib_async.act";
import "../../dataflow_neuro/cell_lib_std.act";
import "../../dataflow_neuro/treegates.act";
import "../../dataflow_neuro/primitives.act";
// import tmpl::dataflow_neuro;
// import tmpl::dataflow_neuro;
import std::channel;
open std::channel;
// import std::func;
open std;
import std::data;
open std::data;
// import dev::channel;
// open dev::channel;

namespace tmpl {
namespace dataflow_neuro {

// NOTE(review): in the copy of this file that was reviewed, the `template`
// headers and most instantiations carry no explicit `<...>` parameter/argument
// lists. The parameter names mentioned in the comments below (Nc, N, Nx, Ny,
// NxC, NyC, ...) are taken from how the bodies use them - confirm the actual
// lists and their order against the canonical source.

/**
 * Dualrail decoder.
 * Nc is the number of dualrail input channels (in.d[0..Nc-1]).
 * N is the number of decoded outputs (out[0..N-1]).
 *
 * Every input rail is fanned out through a sigbuf; output i is the AND
 * (andtree) over all Nc bits, taking the .t rail of bit j when binary digit j
 * of i is 1 and the .f rail when it is 0.
 */
export template defproc decoder_dualrail (Mx1of2 in; bool? out[N]; power supply) {
    // signal buffers: one per rail; output [i] of each buffer feeds AND tree i
    sigbuf in_tX[Nc];
    sigbuf in_fX[Nc];
    (i:Nc:
        in_tX[i].supply = supply;
        in_tX[i].in = in.d[i].t;
        in_fX[i].supply = supply;
        in_fX[i].in = in.d[i].f;
    )

    // AND trees, one per output
    pint bitval;
    andtree atree[N];
    (k:0..N-1:atree[k].supply = supply;)
    (i:0..N-1:
        (j:0..Nc-1:
            bitval = (i & ( 1 << j )) >> j; // Get binary digit of integer i, column j
            [bitval = 1 ->
                atree[i].in[j] = in_tX[j].out[i];
                // atree[i].in[j] = addr_buf.out.d.d[j].t;
            []bitval = 0 ->
                atree[i].in[j] = in_fX[j].out[i];
                // atree[i].in[j] = addr_buf.out.d.d[j].f;
            // Guard against bitval being anything other than 0 or 1.
            // NOTE(review): the assertion text is uninformative; consider a
            // descriptive message such as "bit extraction produced a non-binary value".
            []bitval >= 2 -> {false : "fuck"};
            ]
            // NOTE(review): this connection does not depend on j and is restated
            // on every iteration of the inner loop; it could sit in the i loop.
            atree[i].out = out[i];
        )
    )
}

/**
 * Dualrail decoder with buffered outputs.
 * Each of the N decoder outputs drives a sigbuf with OUT_STRENGTH outputs.
 * Be careful of out[] indexing: the copies of decoder output i appear at
 * out[i*OUT_STRENGTH .. i*OUT_STRENGTH + OUT_STRENGTH-1].
 */
export template defproc decoder_dualrailX(Mx1of2 in; bool? out[N*OUT_STRENGTH]; power supply) {
    decoder_dualrail decoder(.in = in, .supply = supply);
    sigbuf sb[N];
    (i:N:
        sb[i].in = decoder.out[i];
        sb[i].supply = supply;
        (j:OUT_STRENGTH:
            sb[i].out[j] = out[j + i*OUT_STRENGTH];
        )
    )
}

/**
 * 2D decoder which uses a configurable delay from the VCtrees to buffer ack.
 * Nx is the x size of the decoder array
 * NxC is the number of wires in the x channel.
 * Thus NxC should be something like NxC = ceil(log2(Nx))
 * but my guess is that we can't do logs...
 * N_dly_cfg is the number of config bits in the ACK delay line,
 * with all bits high corresponding to 2**N_dly_cfg -1 DLY4_X1 cells.
 *
 * The input packet is the concatenation (x, y): bits 0..NxC-1 are decoded
 * onto outx, bits NxC..NxC+NyC-1 onto outy.
 */
export template defproc decoder_2d_dly (avMx1of2 in; bool? outx[Nx], outy[Ny], dly_cfg[N_dly_cfg], reset_B; power supply) {
    // Buffer to receive concat(x,y) address packet
    buffer addr_buf(.in = in, .reset_B = reset_B, .supply = supply);

    // Validity trees, one over the x bits and one over the y bits
    vtree vtree_x (.supply = supply);
    vtree vtree_y (.supply = supply);
    (i:0..NxC-1:vtree_x.in.d[i].t = addr_buf.out.d.d[i].t;)
    (i:0..NxC-1:vtree_x.in.d[i].f = addr_buf.out.d.d[i].f;)
    (i:0..NyC-1:vtree_y.in.d[i].t = addr_buf.out.d.d[i+NxC].t;)
    (i:0..NyC-1:vtree_y.in.d[i].f = addr_buf.out.d.d[i+NxC].f;)

    // Delay ack line. Ack line is delayed (but not the val):
    // the combined validity drives out.v directly and out.a through delayprog.
    A_2C_B_X1 C2el(.c1 = vtree_x.out, .c2 = vtree_y.out, .vdd = supply.vdd, .vss = supply.vss);
    addr_buf.out.v = C2el.y;
    delayprog dly(.in = C2el.y, .s = dly_cfg, .supply = supply);
    dly.out = addr_buf.out.a;

    // Decoder X/Y And trees
    decoder_dualrail d_dr_x(.out = outx, .supply = supply);
    (i:0..NxC-1:d_dr_x.in.d[i] = addr_buf.out.d.d[i];)
    decoder_dualrail d_dr_y(.out = outy, .supply = supply);
    (i:0..NyC-1:d_dr_y.in.d[i] = addr_buf.out.d.d[i+NxC];)
}

/**
 * Grid of 2-input AND gates.
 * out[x + y*Nx] is the AND of inx[x] and iny[y].
 * Every input line is fanned out through a sigbuf before reaching its gates.
 */
export template defproc and_grid(bool! out[Nx*Ny]; bool? inx[Nx], iny[Ny]; power supply) {
    // Buffer inputs
    sigbuf xbuf[Nx];
    sigbuf ybuf[Ny];
    (i:Nx:
        xbuf[i].in = inx[i];
        xbuf[i].supply = supply;
    )
    (i:Ny:
        ybuf[i].in = iny[i];
        ybuf[i].supply = supply;
    )

    AND2_X1 ands[Nx*Ny];
    (i:0..Nx*Ny-1:ands[i].vss = supply.vss; ands[i].vdd = supply.vdd;)
    (x:0..Nx-1:
        (y:0..Ny-1:
            ands[x + y*Nx].a = xbuf[x].out[y];
            ands[x + y*Nx].b = ybuf[y].out[x];
            ands[x + y*Nx].y = out[x + y*Nx];
        )
    )
}

/**
 * 2D decoder which uses synapse handshaking using line pulldowns.
 * Nx is the x size of the decoder array
 * NxC is the number of wires in the x channel.
 * but my guess is that we can't do logs...
 * the req on a1of1 out is the req to each synapse.
 * The ack back from each line should go high when the synapse is charged.
 * N_dly is a hard coded delay of the pull down circuit.
 * It can be set to 0.
 *
 * NOTE(review): N_dly is not referenced anywhere in the body below - confirm
 * whether it is still a parameter of this process.
 */
export template defproc decoder_2d_hs (avMx1of2 in; a1of1 out[Nx*Ny]; bool? reset_B; power supply) {
    // Buffer to receive concat(x,y) address packet
    buffer addr_buf(.in = in, .reset_B = reset_B, .supply = supply);

    // Decoder X/Y And trees (x = bits 0..NxC-1, y = bits NxC..NxC+NyC-1)
    decoder_dualrail d_dr_x(.supply = supply);
    (i:0..NxC-1:d_dr_x.in.d[i] = addr_buf.out.d.d[i];)
    decoder_dualrail d_dr_y(.supply = supply);
    (i:0..NyC-1:d_dr_y.in.d[i] = addr_buf.out.d.d[i+NxC];)

    // sig buf for reqx lines, since they go to synapse pull down gates.
    sigbuf d_dr_xX[Nx];
    (i:Nx:
        d_dr_xX[i].in = d_dr_x.out[i];
        d_dr_xX[i].supply = supply;
    )

    // Validity
    vtree vtree_x (.supply = supply);
    vtree vtree_y (.supply = supply);
    (i:0..NxC-1:vtree_x.in.d[i].t = addr_buf.out.d.d[i].t;)
    (i:0..NxC-1:vtree_x.in.d[i].f = addr_buf.out.d.d[i].f;)
    (i:0..NyC-1:vtree_y.in.d[i].t = addr_buf.out.d.d[i+NxC].t;)
    (i:0..NyC-1:vtree_y.in.d[i].f = addr_buf.out.d.d[i+NxC].f;)
    A_2C_B_X1 valid_Cel(.c1 = vtree_x.out, .c2 = vtree_y.out, .y = addr_buf.out.v, .vdd = supply.vdd, .vss = supply.vss);

    // and grid for reqs into synapses
    and_grid _and_grid(.inx = d_dr_x.out, .iny = d_dr_y.out, .supply = supply);
    (i:Nx*Ny: out[i].r = _and_grid.out[i];)

    // Acknowledge pull down time
    // Pull DOWNs on the ackB lines by synapses (easier to invert).
    bool _out_acksB[Nx]; // The vertical output ack lines from each syn.
    A_2N_U_X4 ack_pulldowns[Nx*Ny];
    pint index;
    (i:Nx:
        (j:Ny:
            index = i + Nx*j; // same flattening as and_grid: x + y*Nx
            ack_pulldowns[index].a = out[index].a;
            ack_pulldowns[index].b = d_dr_xX[i].out[j];
            ack_pulldowns[index].y = _out_acksB[i];
            ack_pulldowns[index].vss = supply.vss;
            ack_pulldowns[index].vdd = supply.vdd;
        )
    )

    // Line end pull UPs (triggered once reqs removed)
    // Use two pullups rather than and-pullup
    // bc smaller
    // and bc the delay that an AND induces means that the pullup could
    // end up fighting a synapse pulldown, as both have the correct req sigs.
    A_1P_U_X4 pu[Nx]; // TODO probably replace this with variable strength PU
    A_1P_U_X4 pu_reset[Nx];
    (i:Nx:
        // Uses sigbuf output index Ny, i.e. one beyond the Ny outputs
        // consumed by the pulldowns above.
        pu[i].a = d_dr_xX[i].out[Ny];
        pu[i].y = _out_acksB[i];
        pu[i].vdd = supply.vdd;
        pu[i].vss = supply.vss;
        pu_reset[i].a = reset_B;
        pu_reset[i].y = _out_acksB[i];
        pu_reset[i].vdd = supply.vdd;
        pu_reset[i].vss = supply.vss;
    )

    // ORtree from all output acks, back to the buffer ack.
    // This is instead of the ack that came from the delayed validity trees,
    // in decoder_2d_dly.
    ortree _ortree(.supply = supply);
    INV_X1 out_ack_invs[Nx];
    (i:Nx:
        out_ack_invs[i].a = _out_acksB[i];
        out_ack_invs[i].vdd = supply.vdd;
        out_ack_invs[i].vss = supply.vss;
        _ortree.in[i] = out_ack_invs[i].y;
    )

    // C element to ensure that the buffer receives an invalid
    // _only_ once _both_ ackB has been reset, _and_ its output data
    // has been fully invalidated.
    // Otherwise run into the issue that ack is removed before data is invalid.
    A_2C_B_X1 buf_ack_Cel(.c1 = _ortree.out, .c2 = valid_Cel.y, .y = addr_buf.out.a, .vdd = supply.vdd, .vss = supply.vss);
}

/**
 * 2D decoder which uses either synapse handshaking, or just a delay.
 * Controlled by the "hs_en" (handshake_enable) config bit.
 * hs_en = 0 -> use delayed version.
 * hs_en = 1 -> use synapse handshaking.
 * Regardless of which version is used, the final ack going to the buffer
 * goes through the prog_delay block.
 * Thus, for the handshaking version to be used "correctly",
 * dly_cfg should be set to all zeros.
 *
 * Structure is that of decoder_2d_hs, plus: an inverted hs_en gating the
 * line-end pull-ups, KEEP cells on the ackB lines, and a mux + delayprog in
 * the path to addr_buf.out.a.
 */
export template defproc decoder_2d_hybrid (avMx1of2 in; a1of1 out[Nx*Ny]; bool? dly_cfg[N_dly_cfg], hs_en, reset_B; power supply) {
    bool hs_enB;
    INV_X4 hs_inv(.a = hs_en, .y = hs_enB, .vdd = supply.vdd, .vss = supply.vss);

    // Buffer to receive concat(x,y) address packet
    buffer addr_buf(.in = in, .reset_B = reset_B, .supply = supply);

    // Decoder X/Y And trees (x = bits 0..NxC-1, y = bits NxC..NxC+NyC-1)
    decoder_dualrail d_dr_x(.supply = supply);
    (i:0..NxC-1:d_dr_x.in.d[i] = addr_buf.out.d.d[i];)
    decoder_dualrail d_dr_y(.supply = supply);
    (i:0..NyC-1:d_dr_y.in.d[i] = addr_buf.out.d.d[i+NxC];)

    // sig buf for reqx lines, since they go to synapse pull down gates.
    sigbuf d_dr_xX[Nx];
    (i:Nx:
        d_dr_xX[i].in = d_dr_x.out[i];
        d_dr_xX[i].supply = supply;
    )

    // Validity
    vtree vtree_x (.supply = supply);
    vtree vtree_y (.supply = supply);
    (i:0..NxC-1:vtree_x.in.d[i].t = addr_buf.out.d.d[i].t;)
    (i:0..NxC-1:vtree_x.in.d[i].f = addr_buf.out.d.d[i].f;)
    (i:0..NyC-1:vtree_y.in.d[i].t = addr_buf.out.d.d[i+NxC].t;)
    (i:0..NyC-1:vtree_y.in.d[i].f = addr_buf.out.d.d[i+NxC].f;)
    A_2C_B_X1 valid_Cel(.c1 = vtree_x.out, .c2 = vtree_y.out, .y = addr_buf.out.v, .vdd = supply.vdd, .vss = supply.vss);

    // and grid for reqs into synapses
    and_grid _and_grid(.inx = d_dr_x.out, .iny = d_dr_y.out, .supply = supply);
    (i:Nx*Ny: out[i].r = _and_grid.out[i];)

    // Acknowledge pull down time
    // Pull DOWNs on the ackB lines by synapses (easier to invert).
    bool _out_acksB[Nx]; // The vertical output ack lines from each syn.
    A_2N_U_X4 ack_pulldowns[Nx*Ny];
    pint index;
    (i:Nx:
        (j:Ny:
            index = i + Nx*j; // same flattening as and_grid: x + y*Nx
            ack_pulldowns[index].a = out[index].a;
            ack_pulldowns[index].b = d_dr_xX[i].out[j];
            ack_pulldowns[index].y = _out_acksB[i];
            ack_pulldowns[index].vss = supply.vss;
            ack_pulldowns[index].vdd = supply.vdd;
        )
    )

    // Line end pull UPs (triggered once reqs removed)
    // Use two pullups rather than and-pullup
    // bc smaller
    // and bc the delay that an AND induces means that the pullup could
    // end up fighting a synapse pulldown, as both have the correct req sigs.
    // Unlike decoder_2d_hs, the line pull-up is a 2-input cell whose second
    // input is the inverted handshake enable.
    A_2P_U_X4 pu[Nx]; // TODO probably replace this with variable strength PU
    A_1P_U_X4 pu_reset[Nx];
    (i:Nx:
        // Uses sigbuf output index Ny, i.e. one beyond the Ny outputs
        // consumed by the pulldowns above.
        pu[i].a = d_dr_xX[i].out[Ny];
        pu[i].b = hs_enB;
        pu[i].y = _out_acksB[i];
        pu[i].vdd = supply.vdd;
        pu[i].vss = supply.vss;
        pu_reset[i].a = reset_B;
        pu_reset[i].y = _out_acksB[i];
        pu_reset[i].vdd = supply.vdd;
        pu_reset[i].vss = supply.vss;
    )

    // Add keeps (currently don't do anything in ACT)
    KEEP_X1 keeps[Nx];
    (i:Nx:
        keeps[i].vdd = supply.vdd;
        keeps[i].vss = supply.vss;
        keeps[i].y = _out_acksB[i];
    )

    // ORtree from all output acks, back to the buffer ack.
    // This is instead of the ack that came from the delayed validity trees,
    // in decoder_2d_dly.
    ortree _ortree(.supply = supply);
    INV_X1 out_ack_invs[Nx];
    (i:Nx:
        out_ack_invs[i].a = _out_acksB[i];
        out_ack_invs[i].vdd = supply.vdd;
        out_ack_invs[i].vss = supply.vss;
        _ortree.in[i] = out_ack_invs[i].y;
    )

    // C element to ensure that the buffer receives an invalid
    // _only_ once _both_ ackB has been reset, _and_ its output data
    // has been fully invalidated.
    // Otherwise run into the issue that ack is removed before data is invalid.
    A_2C_B_X1 buf_ack_Cel(.c1 = _ortree.out, .c2 = valid_Cel.y, .vdd = supply.vdd, .vss = supply.vss);

    // Mux to switch between acks from handshake or delay:
    // .a is the plain validity signal, .b is the handshake ack.
    // NOTE(review): presumably s = 0 selects .a (matching "hs_en = 0 -> delayed
    // version" above) - confirm against the MUX2_X1 cell definition.
    MUX2_X1 ack_mux(.s = hs_en, .a = valid_Cel.y, .b = buf_ack_Cel.y, .vdd = supply.vdd, .vss = supply.vss);

    // Programmable delay
    delayprog dly(.in = ack_mux.y, .out = addr_buf.out.a, .s = dly_cfg, .supply = supply);
}

/*
 * Build an arbiter_handshake tree.
 *
 * The N input channels are combined pairwise, layer by layer, by
 * arbiter_handshake cells until a single channel remains, which is connected
 * to out. A layer with an odd number of channels passes its last channel
 * straight through to the next layer.
 *
 * The construction runs the same loop twice: the first pass only counts the
 * cells and tree nodes needed (so the arrays can be sized), the second pass
 * does the wiring.
 */
export template defproc arbtree (a1of1 in[N]; a1of1 out; power supply) {
    bool tout; // NOTE(review): not referenced anywhere else in this process
    { N > 0 : "What?" };

    pint i, end, j;
    i = 0;
    end = N-1;
    pint arbCount;
    arbCount = 0;

    /* Pre"calculate" the number of arbiter cells required, look below if confused */
    *[ i != end ->
        j = 0;
        *[ i <= end ->
            j = j + 1;
            [i = end ->
                i = end+1;
            [] i+1 = end ->
                i = end+1;
                arbCount = arbCount +1;
            [] else ->
                i = i + 2;
                arbCount = arbCount +1;
            ]
        ]
        /*-- update range that has to be combined --*/
        // (every branch above leaves the inner loop with i = end+1 already)
        // i = end+1;
        end = end+j;
    ]

    /* array that holds ALL the nodes in the arbiter tree */
    a1of1 tmp[end+1];

    // Connecting the first nodes to the input
    (l:N:
        tmp[l] = in[l];
    )

    /* array to hold the actual arbiter_handshake cells */
    [arbCount > 0 ->
        arbiter_handshake arbs[arbCount];
    ]
    (h:arbCount:arbs[h].supply = supply;)

    /* Reset the variables we just stole lol */
    i = 0;
    end = N-1;
    j = 0;
    pint arbIndex = 0;

    /* Invariant: i <= end */
    *[ i != end ->
        /*
         * Invariant: tmp[i..end] has the current channels that need to be
         * combined together. j counts the nodes produced for the next layer.
         */
        j = 0;
        *[ i <= end ->
            /*-- there are still signals that need to be combined --*/
            j = j + 1;
            [ i = end ->
                /*-- last piece: pipe input through to next layer --*/
                tmp[end+j] = tmp[i];
                i = end+1;
            [] i+1 = end ->
                /*-- last piece: exactly two channels left, use one arbiter --*/
                arbs[arbIndex].in1 = tmp[i];
                arbs[arbIndex].in2 = tmp[i+1];
                arbs[arbIndex].out = tmp[end+j];
                arbIndex = arbIndex +1;
                i = end+1;
            [] else ->
                /*-- more to come; combine the next two channels --*/
                arbs[arbIndex].in1 = tmp[i];
                arbs[arbIndex].in2 = tmp[i+1];
                arbs[arbIndex].out = tmp[end+j];
                arbIndex = arbIndex +1;
                i = i + 2;
            ]
        ]
        /*-- update range that has to be combined --*/
        i = end+1;
        end = end+j;
        j = 0;
    ]

    // The last node created is the root of the tree.
    out = tmp[end];
}

// Generates the OR-trees required to go from
// N one-hot inputs to Nc dual rail binary encoding.
//
// For output bit i, input j is connected to the "true" OR-tree (ors_t[i]) when
// binary digit i of j is 1 and to the "false" OR-tree (ors_f[i]) when it is 0.
// Indices j from N up to _N-1 have no input line; those OR-tree inputs are
// tied to a TIELO cell instead.
export template defproc dualrail_encoder(bool? in[N]; Mx1of2 out; power supply) {
    // NOTE(review): the next statement looks truncated in the reviewed copy:
    // the text between `1<` and `ors_t` is missing. That span presumably held
    // the rest of a size assertion on N, the definition of _N (used below as
    // the OR-tree size and the j loop bound), and the `ortree<_N/2>` type of
    // ors_t. Restore it from the canonical source.
    {N <= 1< ors_t[Nc];
    ortree<_N/2> ors_f[Nc];
    (i:Nc:ors_t[i].supply = supply; ors_t[i].out = out.d[i].t;)
    (i:Nc:ors_f[i].supply = supply; ors_f[i].out = out.d[i].f;)

    pint num_connected_t; // Number of inputs already connected to the current OR tree
    pint num_connected_f;

    TIELO_X1 tielo(.vdd = supply.vdd, .vss = supply.vss); // I'm sorry

    pint bitval;
    (i:0..Nc-1: // For each output line
        num_connected_t = 0;
        num_connected_f = 0;
        (j:0.. _N-1:
            bitval = (j & ( 1 << i )) >> i; // Get binary digit of integer j, column i
            [bitval = 1 & j <= N-1->
                ors_t[i].in[num_connected_t] = in[j];
                num_connected_t = num_connected_t + 1;
            [] bitval = 0 & j <= N-1->
                ors_f[i].in[num_connected_f] = in[j];
                num_connected_f = num_connected_f + 1;
            [] bitval = 1 & j > N-1->
                // no such input line: pad the true tree with a constant low
                ors_t[i].in[num_connected_t] = tielo.y;
                num_connected_t = num_connected_t + 1;
            [] bitval = 0 & j > N-1->
                // no such input line: pad the false tree with a constant low
                ors_f[i].in[num_connected_f] = tielo.y;
                num_connected_f = num_connected_f + 1;
            ]
        )
    )
}

/**
 * Buffer function code.
 * Is the function block ripped from the buffer_s.
 * Used in the encoder2d.
 *
 * For each of the N bits, one A_2C2N_RB_X4 cell per rail drives out.d.d[i].
 * Each cell takes: c1 = buffered en, c2 = buffered inverse of out.a,
 * n1 = the input rail, n2 = buffered in_v, and pr_B / sr_B = buffered reset_B.
 */
export template defproc buffer_s_func (Mx1of2 in; avMx1of2 out; bool? in_v, en, reset_B; power supply) {
    //function
    bool _out_a_BX_t[N],_out_a_BX_f[N],_out_a_B,_en_X_t[N],_en_X_f[N], _in_vX, _in_vXX_t[N],_in_vXX_f[N];
    A_2C2N_RB_X4 f_buf_func[N];
    A_2C2N_RB_X4 t_buf_func[N];

    // reset buffers
    bool _reset_BX,_reset_BXX[N];
    BUF_X1 reset_buf(.a=reset_B, .y=_reset_BX,.vdd=supply.vdd,.vss=supply.vss);
    sigbuf reset_bufarray(.in=_reset_BX, .out=_reset_BXX, .supply=supply);

    // Enable signal buffers
    sigbuf en_buf_t(.in=en, .out=_en_X_t, .supply=supply);
    sigbuf en_buf_f(.in=en, .out=_en_X_f, .supply=supply);

    // out ack signal buffers
    INV_X1 out_a_inv(.a=out.a,.y=_out_a_B, .vss = supply.vss, .vdd = supply.vdd);
    // NOTE(review): the instance suffixes are crossed (…_buf_f drives …_BX_t and
    // vice versa). Both buffers share the same input, so only the naming is affected.
    sigbuf out_a_B_buf_f(.in=_out_a_B,.out=_out_a_BX_t, .supply=supply);
    sigbuf out_a_B_buf_t(.in=_out_a_B,.out=_out_a_BX_f, .supply=supply);

    // in val signal buffers
    BUF_X4 in_v_prebuf(.a = in_v, .y = _in_vX, .vss = supply.vss, .vdd = supply.vdd);
    sigbuf in_v_buf_t(.in=_in_vX, .out=_in_vXX_t, .supply=supply);
    sigbuf in_v_buf_f(.in=_in_vX, .out=_in_vXX_f, .supply=supply);

    (i:N:
        f_buf_func[i].y=out.d.d[i].f;
        t_buf_func[i].y=out.d.d[i].t;
        f_buf_func[i].c1=_en_X_f[i];
        t_buf_func[i].c1=_en_X_t[i];
        f_buf_func[i].c2=_out_a_BX_f[i];
        t_buf_func[i].c2=_out_a_BX_t[i];
        f_buf_func[i].n1=in.d[i].f;
        t_buf_func[i].n1=in.d[i].t;
        f_buf_func[i].n2=_in_vXX_f[i];
        t_buf_func[i].n2=_in_vXX_t[i];
        f_buf_func[i].vdd=supply.vdd;
        t_buf_func[i].vdd=supply.vdd;
        f_buf_func[i].vss=supply.vss;
        t_buf_func[i].vss=supply.vss;
        t_buf_func[i].pr_B = _reset_BXX[i];
        t_buf_func[i].sr_B = _reset_BXX[i];
        f_buf_func[i].pr_B = _reset_BXX[i];
        f_buf_func[i].sr_B = _reset_BXX[i];
    )
}

/**
 * 2D encoder.
 *
 * Wiring as visible in the body:
 *  - the request lines of inx[] / iny[] go into one arbtree each;
 *  - the per-line acks coming back from the arbtrees are buffered
 *    (sigbuf_1output) onto inx[i].a / iny[i].a;
 *  - those ack lines are the one-hot inputs of two dualrail_encoders;
 *  - the two encoder outputs are concatenated (x in bits 0..NxC-1, y in bits
 *    NxC..NxC+NyC-1) and passed through buffer_s_func to drive out.
 * The cells in between generate the arbiter-output acks and the buffer enable.
 */
export template defproc encoder2d(a1of1 inx[Nx]; a1of1 iny[Ny]; avMx1of2<(NxC + NyC)> out; power supply; bool reset_B) {
    // Reset buffers
    pint H = 2*(NxC + NyC); //Reset strength? to be investigated
    bool _reset_BX,_reset_BXX[H];
    BUF_X4 reset_buf(.a=reset_B, .y=_reset_BX,.vdd=supply.vdd,.vss=supply.vss);
    // NOTE(review): _reset_BXX is not consumed anywhere in this process; the
    // cells below use _reset_BX and buf_s_func receives the raw reset_B.
    sigbuf<2*(NxC + NyC)> reset_bufarray(.in=_reset_BX, .out=_reset_BXX,.supply=supply);

    // Arbiters
    a1of1 _arb_out_x, _arb_out_y;
    a1of1 _x_temp[Nx],_y_temp[Ny]; // For wiring the reqs to the arbtrees
    (i:Nx:
        _x_temp[i].r = inx[i].r;
    )
    (i:Ny:
        _y_temp[i].r = iny[i].r;
    )
    arbtree Xarb(.in = _x_temp,.out = _arb_out_x,.supply = supply);
    arbtree Yarb(.in = _y_temp,.out = _arb_out_y,.supply = supply);

    // Sigbufs for strong acknowledge signals from arb_in's
    sigbuf_1output x_ack_arb[Nx];
    sigbuf_1output y_ack_arb[Ny];
    (i:Nx:
        x_ack_arb[i].in = _x_temp[i].a;
        x_ack_arb[i].out = inx[i].a;
        x_ack_arb[i].supply = supply;
    )
    (i:Ny:
        y_ack_arb[i].in = _y_temp[i].a;
        y_ack_arb[i].out = iny[i].a;
        y_ack_arb[i].supply = supply;
    )

    // This block checks that the input is valid and that the arbiter made a choice
    // Then activates the ack of the arbiter (y side)
    bool _x_v,_in_x_v,_in_y_v,_x_a_B,_x_a;
    A_2C2P_RB_X1 Y_ack_confirm();
    Y_ack_confirm.p1 = _x_v;
    Y_ack_confirm.p2 =_in_x_v;
    Y_ack_confirm.c1 = _arb_out_y.r;
    Y_ack_confirm.c2 = _x_a_B;
    Y_ack_confirm.y = _arb_out_y.a;
    Y_ack_confirm.vdd = supply.vdd;
    Y_ack_confirm.vss = supply.vss;
    // NOTE(review): every other resettable cell in this file is wired through
    // pr_B / sr_B; confirm that A_2C2P_RB_X1 really exposes a port named reset_B.
    Y_ack_confirm.reset_B = _reset_BX;

    // This block checks that the input is valid and that the arbiter made a choice
    // Then activates the ack of the arbiter (x side)
    A_2C_RB_X1 X_ack_confirm();
    X_ack_confirm.c1 = _arb_out_x.r;
    X_ack_confirm.c2 = _x_a_B;
    X_ack_confirm.vdd = supply.vdd;
    X_ack_confirm.vss = supply.vss;
    X_ack_confirm.pr_B = _reset_BX;
    X_ack_confirm.sr_B = _reset_BX;
    X_ack_confirm.y = _arb_out_x.a;

    // X_req ORtree: _x_v is the OR of all x request lines
    bool _x_req_array[Nx], _x_v_B;
    (i:Nx:_x_req_array[i] = inx[i].r;)
    ortree x_req_ortree(.in = _x_req_array,.out = _x_v,.supply = supply); //todo BUFF
    // NOTE(review): unlike the other INV_X1 instances in this file, this one has
    // no vdd/vss connections - presumably an omission; confirm and connect supply.
    INV_X1 not_x_req_ortree(.a = _x_v,.y = _x_v_B);

    //X_REQ validation
    // bool _x_req_array[Nx],_x_v_B, _en;
    // (i:Nx:_x_req_array[i] = x[i].r;)
    // ortree x_req_ortree(.in = _x_req_array,.out = _x_v,.supply = supply);
    // INV_X1 not_x_req_ortree(.a = _x_v,.y = _x_v_B);

    bool _en;
    A_1C3P2P2N_R_X1 x_ack(); // NEEDS BUFFERING TO X4
    //branch1
    x_ack.p4 = _in_x_v;
    x_ack.p5 = _x_v_B;
    //branch2
    x_ack.p1 = _in_x_v;
    x_ack.p2 = _in_y_v;
    x_ack.p3 = _x_v;
    //
    x_ack.c1 = _en;
    x_ack.n1 = out.v;
    x_ack.n2 = _in_x_v;
    //
    x_ack.y = _x_a_B;
    //
    x_ack.vdd = supply.vdd;
    x_ack.vss = supply.vss;
    x_ack.pr_B = _reset_BX;
    x_ack.sr_B = _reset_BX;

    INV_X1 not_x_ack(.a = _x_a_B, .y = _x_a, .vdd = supply.vdd, .vss = supply.vss);

    // Enable for the output buffer function block
    A_1C2P_X1 enabling(.p1 = out.a, .p2 = out.v, .c1 = _x_a, .y = _en, .vdd = supply.vdd, .vss = supply.vss);

    avMx1of2<(NxC + NyC)> _in_x; // NOTE(review): not referenced anywhere else in this process

    // Encoders: one-hot ack lines -> dual rail
    bool x_acks[Nx];
    Mx1of2 x_enc_out;
    (i:Nx:x_acks[i] = inx[i].a;)
    dualrail_encoder x_encoder(.in = x_acks, .out = x_enc_out, .supply = supply);
    bool y_acks[Ny];
    Mx1of2 y_enc_out;
    (i:Ny:y_acks[i] = iny[i].a;)
    dualrail_encoder y_encoder(.in = y_acks, .out = y_enc_out, .supply = supply);

    // Valid trees
    vtree vtree_x(.in = x_enc_out, .out = _in_x_v, .supply = supply);
    vtree vtree_y(.in = y_enc_out, .out = _in_y_v, .supply = supply);

    // Buffer func thing: concatenate x (low bits) and y (high bits)
    Mx1of2 into_buffer;
    (i:0..NxC-1:into_buffer.d[i] = x_enc_out.d[i];)
    (i:0..NyC-1:into_buffer.d[i+NxC] = y_enc_out.d[i];)
    AND2_X1 _in_xy_v(.a = _in_x_v, .b = _in_y_v, .vss = supply.vss, .vdd = supply.vdd);
    buffer_s_func buf_s_func(.in = into_buffer, .out = out, .en = _en, .in_v = _in_xy_v.y, .supply = supply, .reset_B = reset_B);
}

/**
 * Neuron handshaking.
 * Looks for a rising edge on the neuron req.
 * Then performs a 2d handshake out outy then outx.
 *
 * This process only instantiates pull-ups on outy.r / outx.r; nothing here
 * drives those request lines low.
 */
export defproc nrn_hs_2d(a1of1 in; a1of1 outx; a1of1 outy; power supply; bool reset_B) {
    bool _reset_BX;
    BUF_X2 reset_buf(.a = reset_B, .y = _reset_BX, .vdd = supply.vdd, .vss = supply.vss);

    bool _en, _req;
    // A_1C2N_RB_X1 A_ack(.c1 = _en, .n1 = _req, .n2 = in.r, .y = in.a,
    //     .pr_B = _reset_BX, .sr_B = _reset_BX, .vss = supply.vss, .vdd = supply.vdd);
    // Switched it back
    // Because had the problem that if the req was not removed in time,
    // it would be recounted as a double spike,
    // since in.req is still high after the out has been dealt with.
    A_2C1N_RB_X1 A_ack(.c1 = _en, .c2 = in.r, .n1 = _req, .y = in.a, .pr_B = _reset_BX, .sr_B = _reset_BX, .vss = supply.vss, .vdd = supply.vdd);

    A_1C1P_X1 A_en(.p1 = _req, .c1 = in.a, .y = _en, .vss = supply.vss, .vdd = supply.vdd);

    // Inverted output acks
    bool _y_a_B, _x_a_B;
    INV_X2 inv_x(.a = outx.a, .y = _x_a_B, .vss = supply.vss, .vdd = supply.vdd);
    INV_X2 inv_y(.a = outy.a, .y = _y_a_B, .vss = supply.vss, .vdd = supply.vdd);

    A_2C1P1N_RB_X1 A_req(.p1 = _x_a_B, .c1 = _en, .c2 = _y_a_B, .n1 = in.r, .y = _req, .pr_B = _reset_BX, .sr_B = _reset_BX, .vdd = supply.vdd, .vss = supply.vss);

    // y_req pull up
    NAND2_X1 nand_y(.a = _y_a_B, .b = _req, .vdd = supply.vdd, .vss = supply.vss);
    A_1P_U_X4 pu_y(.a = nand_y.y, .y = outy.r, .vdd = supply.vdd, .vss = supply.vss);

    // x_req pull up (additionally gated by the y ack, so y goes first)
    NAND3_X1 nand_x(.a = _x_a_B, .b = _req, .c = outy.a, .vdd = supply.vdd, .vss = supply.vss);
    A_1P_U_X4 pu_x(.a = nand_x.y, .y = outx.r, .vdd = supply.vdd, .vss = supply.vss);
}

/**
 * Line-end pull down for a shared request line.
 * `in` passes through two buffers and an inverter, is NANDed with reset_B,
 * and the result drives a pull-down cell on `out`.
 */
export defproc nrn_line_end_pull_down (bool? in; bool? reset_B; power supply; bool! out) {
    bool _out, __out, nand_out;
    BUF_X1 buf1(.a=in, .y=_out, .vdd=supply.vdd,.vss=supply.vss);
    BUF_X1 buf2(.a=_out, .y=__out, .vdd=supply.vdd,.vss=supply.vss);
    INV_X1 inv(.a = __out, .vdd=supply.vdd,.vss =supply.vss);
    NAND2_X1 aenor(.a=inv.y, .b=reset_B, .y = nand_out, .vdd=supply.vdd,.vss=supply.vss);
    // NOTE(review): no vdd/vss connections here, unlike every other pull-up /
    // pull-down instance in this file - presumably an omission; confirm.
    A_1N_U_X4 pull_down(.a=nand_out, .y=out);
}

/**
 * A 2d grid of neuron handshakers.
 * Should then slot into the encoder.
 * Each neuron has an a1of1 channel (in), which is tripped when a neuron spikes.
 * N_dly is number of delay elements to add to line pull down,
 * for the purpose of running ACT sims.
 * It should probably be set to 0 though.
 *
 * Neuron (i, j) is in[i + j*Nx]; it shares line _outx[i] with its column and
 * _outy[j] with its row. Each shared line has a buffered request output, a
 * buffered ack input, a delayed line-end pull down and a keeper.
 */
export template defproc nrn_hs_2d_array(a1of1 in[Nx*Ny]; a1of1 outx[Nx], outy[Ny]; power supply; bool reset_B) {
    // Make hella signal buffers:
    // reset_B -> rsbx -> one sigbuf per row -> one output per neuron
    sigbuf rsbx(.in = reset_B, .supply = supply);
    sigbuf rsb[Ny]; // ResetSigBuf
    (j:Ny:
        rsb[j].in = rsbx.out[j];
        rsb[j].supply = supply;
    )

    // Add buffers on output req lines
    a1of1 _outx[Nx], _outy[Ny];
    BUF_X4 out_req_buf_x[Nx];
    (i:Nx:
        out_req_buf_x[i].vss = supply.vss;
        out_req_buf_x[i].vdd = supply.vdd;
        out_req_buf_x[i].a = _outx[i].r;
        out_req_buf_x[i].y = outx[i].r;
    )
    BUF_X4 out_req_buf_y[Ny];
    (i:Ny:
        out_req_buf_y[i].vss = supply.vss;
        out_req_buf_y[i].vdd = supply.vdd;
        out_req_buf_y[i].a = _outy[i].r;
        out_req_buf_y[i].y = outy[i].r;
    )

    // Add buffers on output ack lines
    // Note that this should be generalised.
    // And probably won't even be done by ACT/innovus anyway
    // TODO: do it properly with sigbufs?
    BUF_X4 out_ack_buf_x[Nx];
    (i:Nx:
        out_ack_buf_x[i].vss = supply.vss;
        out_ack_buf_x[i].vdd = supply.vdd;
        out_ack_buf_x[i].a = outx[i].a;
        out_ack_buf_x[i].y = _outx[i].a;
    )
    BUF_X4 out_ack_buf_y[Ny];
    (i:Ny:
        out_ack_buf_y[i].vss = supply.vss;
        out_ack_buf_y[i].vdd = supply.vdd;
        out_ack_buf_y[i].a = outy[i].a;
        out_ack_buf_y[i].y = _outy[i].a;
    )

    // Create handshake grid
    pint index;
    nrn_hs_2d neurons[Nx*Ny];
    (i:0..Nx-1:
        (j:0..Ny-1:
            index = i + j*Nx;
            neurons[index].supply = supply;
            neurons[index].reset_B = rsb[j].out[i];
            neurons[index].in = in[index];
            neurons[index].outx = _outx[i];
            neurons[index].outy = _outy[j];
        )
    )

    // Create delay fifos to emulate the fact that the line pull downs
    // are at the end of the line, and thus slow.
    // Note that if N_dly = 0, delay fifo is just a pipe.
    delay_chain dly_x[Nx];
    delay_chain dly_y[Ny];

    // Create x line req pull downs (driven by the delayed, buffered x ack)
    nrn_line_end_pull_down pd_x[Nx];
    sigbuf rsb_pd_x(.in = reset_B, .supply = supply);
    (i:0..Nx-1:
        dly_x[i].supply = supply;
        dly_x[i].in = _outx[i].a;
        pd_x[i].in = dly_x[i].out;
        pd_x[i].out = _outx[i].r;
        pd_x[i].reset_B = rsb_pd_x.out[i];
        pd_x[i].supply = supply;
    )

    // Create y line req pull downs (driven by the delayed, buffered y ack)
    nrn_line_end_pull_down pd_y[Ny];
    sigbuf rsb_pd_y(.in = reset_B, .supply = supply);
    (j:0..Ny-1:
        dly_y[j].supply = supply;
        dly_y[j].in = _outy[j].a;
        pd_y[j].in = dly_y[j].out;
        pd_y[j].out = _outy[j].r;
        pd_y[j].reset_B = rsb_pd_y.out[j];
        pd_y[j].supply = supply;
    )

    // Add keeps on the shared request lines
    KEEP_X1 keep_x[Nx];
    (i:Nx:
        keep_x[i].vdd = supply.vdd;
        keep_x[i].vss = supply.vss;
        keep_x[i].y = _outx[i].r;
    )
    KEEP_X1 keep_y[Ny];
    (j:Ny:
        keep_y[j].vdd = supply.vdd;
        keep_y[j].vss = supply.vss;
        keep_y[j].y = _outy[j].r;
    )
}

}
}