/************************************************************************* * * This file is part of ACT dataflow neuro library * * Copyright (c) 2022 University of Groningen - Ole Richter * Copyright (c) 2022 University of Groningen - Michele Mastella * Copyright (c) 2022 University of Groningen - Hugh Greatorex * Copyright (c) 2022 University of Groningen - Madison Cotteret * * * This source describes Open Hardware and is licensed under the CERN-OHL-W v2 or later * * You may redistribute and modify this documentation and make products * using it under the terms of the CERN-OHL-W v2 (https:/cern.ch/cern-ohl). * This documentation is distributed WITHOUT ANY EXPRESS OR IMPLIED * WARRANTY, INCLUDING OF MERCHANTABILITY, SATISFACTORY QUALITY * AND FITNESS FOR A PARTICULAR PURPOSE. Please see the CERN-OHL-W v2 * for applicable conditions. * * Source location: https://git.web.rug.nl/bics/actlib_dataflow_neuro * * As per CERN-OHL-W v2 section 4.1, should You produce hardware based on * these sources, You must maintain the Source Location visible in its * documentation. * ************************************************************************** */ import "../../dataflow_neuro/cell_lib_async.act"; import "../../dataflow_neuro/cell_lib_std.act"; import "../../dataflow_neuro/treegates.act"; import "../../dataflow_neuro/primitives.act"; // import tmpl::dataflow_neuro; // import tmpl::dataflow_neuro; import std::channel; open std::channel; // import std::func; open std; import std::data; open std::data; // import dev::channel; // open dev::channel; namespace tmpl { namespace dataflow_neuro { /** * 2D decoder which uses a configurable delay from the VCtrees to buffer ack. * Nx is the x size of the decoder array * NxC is the number of wires in the x channel. * Thus NxC should be something like NxC = ceil(log2(Nx)) * but my guess is that we can't do logs... 
* N_dly_cfg is the number of config bits in the ACK delay line,
 * with all bits high corresponding to 2**N_dly_cfg -1 DLY4_X1 cells.
 *
 * Ports:
 *   in        - address packet; data bits 0..NxC-1 carry the x address and
 *               bits NxC..NxC+NyC-1 carry the y address.
 *   outx/outy - decoded select lines; line i is the output of an AND tree
 *               over the rails that spell out the binary value i.
 *   dly_cfg   - select bits handed to the programmable delay on the ack path.
 *   reset_B   - passed to the input buffer.
 */
export template defproc decoder_2d_dly (avMx1of2 in; bool? outx[Nx], outy[Ny], dly_cfg[N_dly_cfg], reset_B; power supply) {
    // NOTE(review): Nx, Ny, NxC, NyC and N_dly_cfg are used as sizes below but
    // no template parameter list is visible on this definition, and the
    // buffer/vtree/andtree/delayprog instances carry no template arguments.
    // Confirm the `<...>` lists are intact in the original file.

    // Buffer to receive concat(x,y) address packet
    buffer addr_buf(.in = in, .reset_B = reset_B, .supply = supply);

    // NEED TO BUFFER OUTPUTS FROM BUFFER I RECKON

    // Validity trees: one over the x bits, one over the y bits
    vtree vtree_x (.supply = supply);
    vtree vtree_y (.supply = supply);
    (i:0..NxC-1:vtree_x.in.d[i].t = addr_buf.out.d.d[i].t;)
    (i:0..NxC-1:vtree_x.in.d[i].f = addr_buf.out.d.d[i].f;)
    (i:0..NyC-1:vtree_y.in.d[i].t = addr_buf.out.d.d[i+NxC].t;)
    (i:0..NyC-1:vtree_y.in.d[i].f = addr_buf.out.d.d[i+NxC].f;)

    // Delay ack line. Ack line is delayed (but not the val)
    A_2C_B_X1 C2el(.c1 = vtree_x.out, .c2 = vtree_y.out, .vdd = supply.vdd, .vss = supply.vss);
    addr_buf.out.v = C2el.y;

    // delayprog dly(.in = tielow.y, .s = dly_cfg, .supply = supply);
    delayprog dly(.in = C2el.y, .s = dly_cfg, .supply = supply);

    // NOTE: the ack has been disconnected at this point for testing before;
    // in this version the delayed signal does drive the buffer's ack.
    dly.out = addr_buf.out.a;

    // AND trees
    pint bitval;

    andtree atree_x[Nx];
    (k:0..Nx-1:atree_x[k].supply = supply;)
    (i:0..Nx-1:
        // One output per tree: connect it once, outside the per-bit loop.
        atree_x[i].out = outx[i];
        (j:0..NxC-1:
            bitval = (i & ( 1 << j )) >> j; // Get binary digit of integer i, column j
            [ bitval = 1 -> atree_x[i].in[j] = addr_buf.out.d.d[j].t;
            [] bitval = 0 -> atree_x[i].in[j] = addr_buf.out.d.d[j].f;
            [] bitval >= 2 -> {false : "decoder_2d_dly: extracted x address bit is not 0 or 1"};
            ]
        )
    )

    andtree atree_y[Ny];
    (k:0..Ny-1:atree_y[k].supply = supply;)
    (i:0..Ny-1:
        // One output per tree: connect it once, outside the per-bit loop.
        atree_y[i].out = outy[i];
        (j:0..NyC-1:
            bitval = (i & ( 1 << j )) >> j; // Get binary digit of integer i, column j
            [ bitval = 1 -> atree_y[i].in[j] = addr_buf.out.d.d[j+NxC].t;
            [] bitval = 0 -> atree_y[i].in[j] = addr_buf.out.d.d[j+NxC].f;
            [] bitval >= 2 -> {false : "decoder_2d_dly: extracted y address bit is not 0 or 1"};
            ]
        )
    )
}

/*
 * Build an arbiter_handshake tree.
*/
/*
 * arbtree: reduces the N a1of1 channels in `in` to the single a1of1 channel
 * `out` through layers of two-input arbiter_handshake cells.
 *
 * The body runs the same layer-by-layer walk twice: once only to count how
 * many arbiters and intermediate nodes are needed (so the arrays can be
 * sized), and once to actually wire them. Within a layer, nodes are taken in
 * pairs; if a single node is left over it is passed through unchanged to the
 * next layer.
 *
 * NOTE(review): N is used as a size but no template parameter list is visible
 * on this definition - confirm the `<...>` list is intact in the original file.
 */
export template defproc arbtree (a1of1 in[N]; a1of1 out; power supply) {
    bool tout;  // NOTE(review): not referenced anywhere else in this process

    { N > 0 : "What?" };

    // i..end is the index range of the nodes still to be combined in the
    // current layer; j counts the nodes produced for the next layer.
    pint i, end, j;
    i = 0;
    end = N-1;
    pint arbCount;
    arbCount = 0;

    /* Pre"calculate" the number of arbiters required, look below if confused */
    *[ i != end ->
        j = 0;
        *[ i <= end ->
            j = j + 1;
            [i = end -> i = end+1;                               // odd one out: no arbiter
            [] i+1 = end -> i = end+1; arbCount = arbCount +1;   // final pair of the layer
            [] else -> i = i + 2; arbCount = arbCount +1;        // a pair, more to follow
            ]
        ]
        /*-- update range that has to be combined --*/
        // i = end+1;   (not needed: the inner loop always exits with i = end+1)
        end = end+j;
    ]

    /* array that holds ALL the nodes in the arbiter tree:
       tmp[0..N-1] are the inputs, later entries are layer outputs */
    a1of1 tmp[end+1];

    // Connecting the first nodes to the input
    (l:N:
        tmp[l] = in[l];
    )

    /* array to hold the actual arbiter_handshake cells
       (none are declared when there is a single input) */
    [arbCount > 0 ->
        arbiter_handshake arbs[arbCount];
    ]

    (h:arbCount:arbs[h].supply = supply;)

    /* Reset the variables we just stole lol */
    i = 0;
    end = N-1;
    j = 0;
    pint arbIndex = 0;  // next unused entry of arbs

    /* Invariant: i <= end */
    *[ i != end ->
        /*
         * Invariant: tmp[i..end] has the current signals that need to be
         * combined together
         */
        j = 0;
        *[ i <= end ->
            /*-- there are still signals that need to be combined --*/
            j = j + 1;
            [ i = end ->
                /*-- last piece: pipe input through to next layer --*/
                tmp[end+j] = tmp[i];
                i = end+1;
            [] i+1 = end ->
                /*-- last piece: a final two-input arbiter closes the layer --*/
                arbs[arbIndex].in1 = tmp[i];
                arbs[arbIndex].in2 = tmp[i+1];
                arbs[arbIndex].out = tmp[end+j];
                arbIndex = arbIndex +1;
                i = end+1;
            [] else ->
                /*-- more to come; so use a two-input arbiter --*/
                arbs[arbIndex].in1 = tmp[i];
                arbs[arbIndex].in2 = tmp[i+1];
                arbs[arbIndex].out = tmp[end+j];
                arbIndex = arbIndex +1;
                i = i + 2;
            ]
        ]
        /*-- update range that has to be combined --*/
        i = end+1;
        end = end+j;
        j = 0;
    ]

    // The last node created is the root of the tree.
    out = tmp[end];
}

// and_grid: one AND2_X1 per (x, y) pair; out[x + y*Nx] is driven by the gate
// whose inputs are inx[x] and iny[y].
export template defproc and_grid(bool! out[Nx*Ny]; bool?
inx[Nx], iny[Ny]; power supply) {
    AND2_X1 ands[Nx*Ny];
    // Supplies for every gate in the grid
    (i:0..Nx*Ny-1:ands[i].vss = supply.vss; ands[i].vdd = supply.vdd;)
    // Gate at flat index x + y*Nx combines column line x with row line y
    (x:0..Nx-1:
        (y:0..Ny-1:
            ands[x + y*Nx].a = inx[x];
            ands[x + y*Nx].b = iny[y];
            ands[x + y*Nx].y = out[x + y*Nx];
        )
    )
}

// Generates the OR-trees required to go from
// N one-hot inputs to Nc dual rail binary encoding.
//
// For output bit i, every input whose index has bit i set feeds the OR tree
// driving out.d[i].t, and every input whose index has bit i clear feeds the
// OR tree driving out.d[i].f. Indices from N up to _N-1 have no real input
// and are tied low instead.
export template defproc dualrail_encoder(bool? in[N]; Mx1of2 out; power supply) {
    // NOTE(review): the next statement is damaged in this copy of the file.
    // The text between `1<` and `ors_t[Nc]` is missing: whatever completed the
    // assertion on N, the definition of _N (used below), and the type of ors_t
    // are not visible here. Restore from the original source before compiling.
    {N <= 1< ors_t[Nc];
    ortree<_N/2> ors_f[Nc];
    (i:Nc:ors_t[i].supply = supply; ors_t[i].out = out.d[i].t;)
    (i:Nc:ors_f[i].supply = supply; ors_f[i].out = out.d[i].f;)

    pint num_connected_t; // Number of inputs already connected to the current "true" OR tree
    pint num_connected_f; // Same, for the current "false" OR tree

    TIELO_X1 tielo(.vdd = supply.vdd, .vss = supply.vss); // constant used to pad OR-tree inputs with no real source

    pint bitval;
    (i:0..Nc-1: // For each output line
        num_connected_t = 0;
        num_connected_f = 0;
        (j:0.. _N-1:
            bitval = (j & ( 1 << i )) >> i; // Get binary digit of integer j, column i
            [bitval = 1 & j <= N-1->
                // real input, bit set: goes to the true rail
                ors_t[i].in[num_connected_t] = in[j];
                num_connected_t = num_connected_t + 1;
            [] bitval = 0 & j <= N-1->
                // real input, bit clear: goes to the false rail
                ors_f[i].in[num_connected_f] = in[j];
                num_connected_f = num_connected_f + 1;
            [] bitval = 1 & j > N-1->
                // padding slot on the true rail
                ors_t[i].in[num_connected_t] = tielo.y;
                num_connected_t = num_connected_t + 1;
            [] bitval = 0 & j > N-1->
                // padding slot on the false rail
                ors_f[i].in[num_connected_f] = tielo.y;
                num_connected_f = num_connected_f + 1;
            ]
        )
    )
}

/**
 * Buffer function code.
 * Is the function block ripped from the buffer_s.
 * Used in the encoder2d.
 */
export template defproc buffer_s_func (Mx1of2 in; avMx1of2 out; bool?
in_v, en, reset_B; power supply) {
    //function
    bool _out_a_BX_t[N],_out_a_BX_f[N],_out_a_B,_en_X_t[N],_en_X_f[N], _in_vX, _in_vXX_t[N],_in_vXX_f[N];
    A_2C2N_RB_X4 f_buf_func[N];
    A_2C2N_RB_X4 t_buf_func[N];

    // reset buffers
    bool _reset_BX,_reset_BXX[N];
    BUF_X1 reset_buf(.a=reset_B, .y=_reset_BX,.vdd=supply.vdd,.vss=supply.vss);
    sigbuf reset_bufarray(.in=_reset_BX, .out=_reset_BXX, .supply=supply);

    // Enable signal buffers
    sigbuf en_buf_t(.in=en, .out=_en_X_t, .supply=supply);
    sigbuf en_buf_f(.in=en, .out=_en_X_f, .supply=supply);

    // out ack signal buffers
    // (the instance names carry the opposite _t/_f suffix from the arrays they
    //  drive; both buffer the same inverted-ack signal)
    INV_X1 out_a_inv(.a=out.a,.y=_out_a_B, .vss = supply.vss, .vdd = supply.vdd);
    sigbuf out_a_B_buf_f(.in=_out_a_B,.out=_out_a_BX_t, .supply=supply);
    sigbuf out_a_B_buf_t(.in=_out_a_B,.out=_out_a_BX_f, .supply=supply);

    // in val signal buffers
    BUF_X4 in_v_prebuf(.a = in_v, .y = _in_vX, .vss = supply.vss, .vdd = supply.vdd);
    sigbuf in_v_buf_t(.in=_in_vX, .out=_in_vXX_t, .supply=supply);
    sigbuf in_v_buf_f(.in=_in_vX, .out=_in_vXX_f, .supply=supply);

    // One cell per rail per bit: c1 = enable, c2 = inverted output ack,
    // n1 = the input rail, n2 = input valid; both reset pins share one net.
    (i:N:
        f_buf_func[i].y=out.d.d[i].f;
        t_buf_func[i].y=out.d.d[i].t;
        f_buf_func[i].c1=_en_X_f[i];
        t_buf_func[i].c1=_en_X_t[i];
        f_buf_func[i].c2=_out_a_BX_f[i];
        t_buf_func[i].c2=_out_a_BX_t[i];
        f_buf_func[i].n1=in.d[i].f;
        t_buf_func[i].n1=in.d[i].t;
        f_buf_func[i].n2=_in_vXX_f[i];
        t_buf_func[i].n2=_in_vXX_t[i];
        f_buf_func[i].vdd=supply.vdd;
        t_buf_func[i].vdd=supply.vdd;
        f_buf_func[i].vss=supply.vss;
        t_buf_func[i].vss=supply.vss;
        t_buf_func[i].pr_B = _reset_BXX[i];
        t_buf_func[i].sr_B = _reset_BXX[i];
        f_buf_func[i].pr_B = _reset_BXX[i];
        f_buf_func[i].sr_B = _reset_BXX[i];
    )
}

/*
 * encoder2D: arbitrates among Nx x-requests and Ny y-requests, encodes the
 * acknowledged line of each axis into dual-rail bits, and emits the
 * concatenated address (x bits first, then y bits) on `out`.
 *
 * NOTE(review): Nx, Ny, NxC and NyC are used as sizes but no template
 * parameter list is visible on this definition, and several instances
 * (arbtree, ortree, dualrail_encoder, vtree, buffer_s_func, sigbuf_1output)
 * carry no template arguments - confirm the `<...>` lists are intact in the
 * original file.
 */
export template defproc encoder2D(a1of1 x[Nx]; a1of1 y[Ny]; avMx1of2<(NxC + NyC)> out; power supply; bool reset_B) {

    // Reset buffers
    pint H = 2*(NxC + NyC); //Reset strength? to be investigated
    // NOTE(review): _reset_BXX is driven below but never read in this process;
    // every cell here uses _reset_BX directly.
    bool _reset_BX,_reset_BXX[H];
    BUF_X4 reset_buf(.a=reset_B, .y=_reset_BX,.vdd=supply.vdd,.vss=supply.vss);
    sigbuf<2*(NxC + NyC)> reset_bufarray(.in=_reset_BX, .out=_reset_BXX,.supply=supply);

    // Arbiters
    a1of1 _arb_out_x, _arb_out_y;
    a1of1 _x_temp[Nx],_y_temp[Ny]; // For wiring the reqs to the arbtrees
    (i:Nx: _x_temp[i].r = x[i].r; )
    (i:Ny: _y_temp[i].r = y[i].r; )
    arbtree Xarb(.in = _x_temp,.out = _arb_out_x,.supply = supply);
    arbtree Yarb(.in = _y_temp,.out = _arb_out_y,.supply = supply);

    // Sigbufs for strong acknowledge signals from arb_in's
    sigbuf_1output x_ack_arb[Nx];
    sigbuf_1output y_ack_arb[Ny];
    (i:Nx:
        x_ack_arb[i].in = _x_temp[i].a;
        x_ack_arb[i].out = x[i].a;
        x_ack_arb[i].supply = supply;
    )
    (i:Ny:
        y_ack_arb[i].in = _y_temp[i].a;
        y_ack_arb[i].out = y[i].a;
        y_ack_arb[i].supply = supply;
    )

    // This block checks that the input is valid and that the arbiter made a choice
    // Then activates the ack of the arbiter
    bool _x_v,_in_x_v,_in_y_v,_x_a_B,_x_a;
    A_2C2P_RB_X1 Y_ack_confirm();
    Y_ack_confirm.p1 = _x_v;
    Y_ack_confirm.p2 =_in_x_v;
    Y_ack_confirm.c1 = _arb_out_y.r;
    Y_ack_confirm.c2 = _x_a_B;
    Y_ack_confirm.y = _arb_out_y.a;
    Y_ack_confirm.vdd = supply.vdd;
    Y_ack_confirm.vss = supply.vss;
    // NOTE(review): every other resettable cell in this file is reset through
    // .pr_B/.sr_B; this is the only one wired through .reset_B. Confirm that
    // A_2C2P_RB_X1 really exposes a port with this name.
    Y_ack_confirm.reset_B = _reset_BX;

    // This block checks that the input is valid and that the arbiter made a choice
    // Then activates the ack of the arbiter
    A_2C_RB_X1 X_ack_confirm();
    X_ack_confirm.c1 = _arb_out_x.r;
    X_ack_confirm.c2 = _x_a_B;
    X_ack_confirm.vdd = supply.vdd;
    X_ack_confirm.vss = supply.vss;
    X_ack_confirm.pr_B = _reset_BX;
    X_ack_confirm.sr_B = _reset_BX;
    X_ack_confirm.y = _arb_out_x.a;

    // X_req ORtree
    bool _x_req_array[Nx], _x_v_B;
    (i:Nx:_x_req_array[i] = x[i].r;)
    ortree x_req_ortree(.in = _x_req_array,.out = _x_v,.supply = supply);
    //todo BUFF
    // Supplies connected explicitly, like every other INV_X1 in this file.
    INV_X1 not_x_req_ortree(.a = _x_v,.y = _x_v_B, .vdd = supply.vdd, .vss = supply.vss);

    //X_REQ validation
    // bool _x_req_array[Nx],_x_v_B, _en;
    // (i:Nx:_x_req_array[i] = x[i].r;)
    // ortree x_req_ortree(.in = _x_req_array,.out = _x_v,.supply = supply);
    // INV_X1 not_x_req_ortree(.a = _x_v,.y = _x_v_B);
    bool _en;

    A_1C3P2P2N_R_X1 x_ack();
    // NEEDS BUFFERING TO X4
    //branch1
    x_ack.p4 = _in_x_v;
    x_ack.p5 = _x_v_B;
    //branch2
    x_ack.p1 = _in_x_v;
    x_ack.p2 = _in_y_v;
    x_ack.p3 = _x_v;
    //
    x_ack.c1 = _en;
    x_ack.n1 = out.v;
    x_ack.n2 = _in_x_v;
    //
    x_ack.y = _x_a_B;
    //
    x_ack.vdd = supply.vdd;
    x_ack.vss = supply.vss;
    x_ack.pr_B = _reset_BX;
    x_ack.sr_B = _reset_BX;

    INV_X1 not_x_ack(.a = _x_a_B, .y = _x_a, .vdd = supply.vdd, .vss = supply.vss);

    A_1C2P_X1 enabling(.p1 = out.a, .p2 = out.v, .c1 = _x_a, .y = _en, .vdd = supply.vdd, .vss = supply.vss);

    // NOTE(review): _in_x is declared but not connected anywhere in this process.
    avMx1of2<(NxC + NyC)> _in_x;

    // Encoders: the per-line acks are the one-hot inputs
    bool x_acks[Nx];
    Mx1of2 x_enc_out;
    (i:Nx:x_acks[i] = x[i].a;)
    dualrail_encoder x_encoder(.in = x_acks, .out = x_enc_out, .supply = supply);

    bool y_acks[Ny];
    Mx1of2 y_enc_out;
    (i:Ny:y_acks[i] = y[i].a;)
    dualrail_encoder y_encoder(.in = y_acks, .out = y_enc_out, .supply = supply);

    // Valid trees
    vtree vtree_x(.in = x_enc_out, .out = _in_x_v, .supply = supply);
    vtree vtree_y(.in = y_enc_out, .out = _in_y_v, .supply = supply);

    // Buffer func thing: x bits occupy d[0..NxC-1], y bits follow at d[NxC..]
    Mx1of2 into_buffer;
    (i:0..NxC-1:into_buffer.d[i] = x_enc_out.d[i];)
    (i:0..NyC-1:into_buffer.d[i+NxC] = y_enc_out.d[i];)

    AND2_X1 _in_xy_v(.a = _in_x_v, .b = _in_y_v, .vss = supply.vss, .vdd = supply.vdd);

    // NOTE(review): the output stage is handed the raw reset_B port, not the
    // locally buffered _reset_BX - confirm this is intended.
    buffer_s_func buf_s_func(.in = into_buffer, .out = out, .en = _en, .in_v = _in_xy_v.y, .supply = supply, .reset_B = reset_B);
}

/**
 * Neuron handshaking.
 * Looks for a rising edge on the neuron req.
 * Then performs a 2d handshake on outy, then outx.
*/
export defproc neuron_hs_2D(a1of1 in; a1of1 outx; a1of1 outy; power supply; bool reset_B) {
    // Local copy of reset for the resettable cells below
    bool _reset_BX;
    BUF_X2 reset_buf(.a = reset_B, .y = _reset_BX, .vdd = supply.vdd, .vss = supply.vss);

    // _req: internal request; _en: enable fed to both the ack and the request cell
    bool _en, _req;
    A_1C2N_RB_X1 A_ack(.c1 = _en, .n1 = _req, .n2 = in.r, .y = in.a,
                       .pr_B = _reset_BX, .sr_B = _reset_BX, .vss = supply.vss, .vdd = supply.vdd);

    A_1C1P_X1 A_en(.p1 = _req, .c1 = in.a, .y = _en, .vss = supply.vss, .vdd = supply.vdd);

    // Inverted line acknowledges
    bool _y_a_B, _x_a_B;
    INV_X2 inv_x(.a = outx.a, .y = _x_a_B, .vss = supply.vss, .vdd = supply.vdd);
    INV_X2 inv_y(.a = outy.a, .y = _y_a_B, .vss = supply.vss, .vdd = supply.vdd);

    A_2C1P1N_RB_X1 A_req(.p1 = _x_a_B, .c1 = _en, .c2 = _y_a_B, .n1 = in.r, .y = _req,
                         .pr_B = _reset_BX, .sr_B = _reset_BX, .vdd = supply.vdd, .vss = supply.vss);

    // y_req pull up
    NAND2_X1 nand_y(.a = _y_a_B, .b = _req, .vdd = supply.vdd, .vss = supply.vss);
    PULLUP_X4 pu_y(.a = nand_y.y, .y = outy.r, .vdd = supply.vdd, .vss = supply.vss);

    // x_req pull up (additionally gated by the y acknowledge)
    NAND3_X1 nand_x(.a = _x_a_B, .b = _req, .c = outy.a, .vdd = supply.vdd, .vss = supply.vss);
    PULLUP_X4 pu_x(.a = nand_x.y, .y = outx.r, .vdd = supply.vdd, .vss = supply.vss);
}

/*
 * line_end_pull_down: drives a PULLDOWN_X4 on `out` from `in` and `reset_B`.
 * `in` passes through two buffers and an inverter, is NANDed with reset_B,
 * and the NAND output controls the pull-down cell.
 */
export defproc line_end_pull_down (bool? in; bool? reset_B; power supply; bool! out) {
    bool _out, __out, nand_out;
    BUF_X1 buf1(.a=in, .y=_out, .vdd=supply.vdd,.vss=supply.vss);
    BUF_X1 buf2(.a=_out, .y=__out, .vdd=supply.vdd,.vss=supply.vss);
    INV_X1 inv(.a = __out, .vdd=supply.vdd,.vss =supply.vss);
    NAND2_X1 aenor(.a=inv.y, .b=reset_B, .y = nand_out, .vdd=supply.vdd,.vss=supply.vss);
    // Supplies connected explicitly, matching the PULLUP_X4 instances in
    // neuron_hs_2D; previously this cell was left without vdd/vss.
    PULLDOWN_X4 pull_down(.a=nand_out, .y=out, .vdd=supply.vdd, .vss=supply.vss);
}

/**
 * A 2d grid of neuron handshakers.
 * Should then slot into the encoder.
 * Each neuron has an a1of1 channel (in), which is tripped when a neuron spikes.
 * N_dly is number of delay elements to add to line pull down,
 * for the purpose of running ACT sims.
 * It should probably be set to 0 though.
*/
export template defproc nrn_hs_2D_array(a1of1 in[Nx*Ny]; a1of1 outx[Nx], outy[Ny]; power supply; bool reset_B) {

    // Reset fan-out: rsbx feeds one sigbuf per row, and each row buffer
    // supplies the reset of the neurons in that row.
    sigbuf rsbx(.in = reset_B, .supply = supply);
    sigbuf rsb[Ny];
    (row:Ny:
        rsb[row].supply = supply;
        rsb[row].in = rsbx.out[row];
    )

    // Handshake grid; the neuron at (col, row) sits at flat index col + row*Nx
    // and shares outx[col] / outy[row] with the rest of its column / row.
    neuron_hs_2D neurons[Nx*Ny];
    (row:Ny:
        (col:Nx:
            neurons[col + row*Nx].in = in[col + row*Nx];
            neurons[col + row*Nx].outx = outx[col];
            neurons[col + row*Nx].outy = outy[row];
            neurons[col + row*Nx].reset_B = rsb[row].out[col];
            neurons[col + row*Nx].supply = supply;
        )
    )

    // Optional delay fifos in front of the pull-downs (only when N_dly >= 1).
    [ N_dly >= 1 ->
        delay_fifo dly_x[Nx];
        delay_fifo dly_y[Ny];
    ]

    // x lines: request pull-down plus keeper on every outx[col].r
    line_end_pull_down pd_x[Nx];
    sigbuf rsb_pd_x(.in = reset_B, .supply = supply);
    KEEP_X1 keep_x[Nx];
    (col:Nx:
        pd_x[col].supply = supply;
        pd_x[col].reset_B = rsb_pd_x.out[col];
        pd_x[col].out = outx[col].r;
        [ N_dly >= 1 ->
            // ack reaches the pull-down through the delay fifo
            dly_x[col].in = outx[col].a;
            dly_x[col].supply = supply;
            pd_x[col].in = dly_x[col].out;
        [] N_dly = 0 ->
            // ack drives the pull-down directly
            pd_x[col].in = outx[col].a;
        ]
        keep_x[col].y = outx[col].r;
        keep_x[col].vdd = supply.vdd;
        keep_x[col].vss = supply.vss;
    )

    // y lines: same arrangement on every outy[row].r
    line_end_pull_down pd_y[Ny];
    sigbuf rsb_pd_y(.in = reset_B, .supply = supply);
    KEEP_X1 keep_y[Ny];
    (row:Ny:
        pd_y[row].supply = supply;
        pd_y[row].reset_B = rsb_pd_y.out[row];
        pd_y[row].out = outy[row].r;
        [ N_dly >= 1 ->
            dly_y[row].in = outy[row].a;
            dly_y[row].supply = supply;
            pd_y[row].in = dly_y[row].out;
        [] N_dly = 0 ->
            pd_y[row].in = outy[row].a;
        ]
        keep_y[row].y = outy[row].r;
        keep_y[row].vdd = supply.vdd;
        keep_y[row].vss = supply.vss;
    )
}

}
}