From 2e4cdd5029b79816ebb46a4edff6d0f8ef83788b Mon Sep 17 00:00:00 2001 From: alexmadison Date: Thu, 31 Mar 2022 12:44:00 +0200 Subject: [PATCH] vastly improved lazy synapse handshakes --- dataflow_neuro/coders.act | 349 +++++++++++++++++--------------------- 1 file changed, 156 insertions(+), 193 deletions(-) diff --git a/dataflow_neuro/coders.act b/dataflow_neuro/coders.act index edf9b34..0468300 100644 --- a/dataflow_neuro/coders.act +++ b/dataflow_neuro/coders.act @@ -53,223 +53,186 @@ namespace tmpl { * Nc is the number of dualrail input channels. * Then builds N output AND gates, connecting to the right input wires. */ - export template - defproc decoder_dualrail (Mx1of2 in; bool? out[N]; power supply) { - // signal buffers - sigbuf in_tX[Nc]; - sigbuf in_fX[Nc]; - (i:Nc: - in_tX[i].supply = supply; - in_tX[i].in = in.d[i].t; +export template +defproc decoder_dualrail (Mx1of2 in; bool? out[N]; power supply) { + // signal buffers + sigbuf in_tX[Nc]; + sigbuf in_fX[Nc]; + (i:Nc: + in_tX[i].supply = supply; + in_tX[i].in = in.d[i].t; - in_fX[i].supply = supply; - in_fX[i].in = in.d[i].f; + in_fX[i].supply = supply; + in_fX[i].in = in.d[i].f; + ) + + // AND trees + pint bitval; + andtree atree[N]; + (k:0..N-1:atree[k].supply = supply;) + (i:0..N-1: + (j:0..Nc-1: + bitval = (i & ( 1 << j )) >> j; // Get binary digit of integer i, column j + [bitval = 1 -> + atree[i].in[j] = in_tX[j].out[i]; + // atree[i].in[j] = addr_buf.out.d.d[j].t; + []bitval = 0 -> + atree[i].in[j] = in_fX[j].out[i]; + // atree[i].in[j] = addr_buf.out.d.d[j].f; + []bitval >= 2 -> {false : "fuck"}; + ] + atree[i].out = out[i]; ) - - // AND trees - pint bitval; - andtree atree[N]; - (k:0..N-1:atree[k].supply = supply;) - (i:0..N-1: - (j:0..Nc-1: - bitval = (i & ( 1 << j )) >> j; // Get binary digit of integer i, column j - [bitval = 1 -> - atree[i].in[j] = in_tX[j].out[i]; - // atree[i].in[j] = addr_buf.out.d.d[j].t; - []bitval = 0 -> - atree[i].in[j] = in_fX[j].out[i]; - // atree[i].in[j] = 
addr_buf.out.d.d[j].f; - []bitval >= 2 -> {false : "fuck"}; - ] - atree[i].out = out[i]; - ) - ) - } + ) +} - /** - * 2D decoder which uses a configurable delay from the VCtrees to buffer ack. - * Nx is the x size of the decoder array - * NxC is the number of wires in the x channel. - * Thus NxC should be something like NxC = ceil(log2(Nx)) - * but my guess is that we can't do logs... - * N_dly_cfg is the number of config bits in the ACK delay line, - * with all bits high corresponding to 2**N_dly_cfg -1 DLY4_X1 cells. - */ - export template - defproc decoder_2d_dly (avMx1of2 in; bool? outx[Nx], outy[Ny], - dly_cfg[N_dly_cfg], reset_B; power supply) { +/** + * 2D decoder which uses a configurable delay from the VCtrees to buffer ack. + * Nx is the x size of the decoder array + * NxC is the number of wires in the x channel. + * Thus NxC should be something like NxC = ceil(log2(Nx)) + * but my guess is that we can't do logs... + * N_dly_cfg is the number of config bits in the ACK delay line, + * with all bits high corresponding to 2**N_dly_cfg -1 DLY4_X1 cells. + */ +export template +defproc decoder_2d_dly (avMx1of2 in; bool? 
outx[Nx], outy[Ny], + dly_cfg[N_dly_cfg], reset_B; power supply) { - // Buffer to recieve concat(x,y) address packet - buffer addr_buf(.in = in, .reset_B = reset_B, .supply = supply); + // Buffer to receive concat(x,y) address packet + buffer addr_buf(.in = in, .reset_B = reset_B, .supply = supply); - // Validity trees - vtree vtree_x (.supply = supply); - vtree vtree_y (.supply = supply); - (i:0..NxC-1:vtree_x.in.d[i].t = addr_buf.out.d.d[i].t;) - (i:0..NxC-1:vtree_x.in.d[i].f = addr_buf.out.d.d[i].f;) - (i:0..NyC-1:vtree_y.in.d[i].t = addr_buf.out.d.d[i+NxC].t;) - (i:0..NyC-1:vtree_y.in.d[i].f = addr_buf.out.d.d[i+NxC].f;) + // Validity trees + vtree vtree_x (.supply = supply); + vtree vtree_y (.supply = supply); + (i:0..NxC-1:vtree_x.in.d[i].t = addr_buf.out.d.d[i].t;) + (i:0..NxC-1:vtree_x.in.d[i].f = addr_buf.out.d.d[i].f;) + (i:0..NyC-1:vtree_y.in.d[i].t = addr_buf.out.d.d[i+NxC].t;) + (i:0..NyC-1:vtree_y.in.d[i].f = addr_buf.out.d.d[i+NxC].f;) - // Delay ack line. Ack line is delayed (but not the val) - A_2C_B_X1 C2el(.c1 = vtree_x.out, .c2 = vtree_y.out, .vdd = supply.vdd, .vss = supply.vss); - addr_buf.out.v = C2el.y; + // Delay ack line. Ack line is delayed (but not the val) + A_2C_B_X1 C2el(.c1 = vtree_x.out, .c2 = vtree_y.out, .vdd = supply.vdd, .vss = supply.vss); + addr_buf.out.v = C2el.y; - // delayprog dly(.in = tielow.y, .s = dly_cfg, .supply = supply); - delayprog dly(.in = C2el.y, .s = dly_cfg, .supply = supply); - - // ACK MAY HAVE BEEN DISCONNECTED HERE - // FOR TESTING PURPOSES - // !!!!!!!!!!!!!!!! - dly.out = addr_buf.out.a; - // ACK MAY HAVE BEEN DISCONNECTED HERE - // FOR TESTING PURPOSES - // !!!!!!!!!!!!!!!! + // delayprog dly(.in = tielow.y, .s = dly_cfg, .supply = supply); + delayprog dly(.in = C2el.y, .s = dly_cfg, .supply = supply); + + // ACK MAY HAVE BEEN DISCONNECTED HERE + // FOR TESTING PURPOSES + // !!!!!!!!!!!!!!!! 
+ dly.out = addr_buf.out.a; + // ACK MAY HAVE BEEN DISCONNECTED HERE + // FOR TESTING PURPOSES + // !!!!!!!!!!!!!!!! - // Decoder X/Y And trees - decoder_dualrail d_dr_x(.out = outx, .supply = supply); - (i:0..NxC-1:d_dr_x.in.d[i] = addr_buf.out.d.d[i];) + // Decoder X/Y And trees + decoder_dualrail d_dr_x(.out = outx, .supply = supply); + (i:0..NxC-1:d_dr_x.in.d[i] = addr_buf.out.d.d[i];) - decoder_dualrail d_dr_y(.out = outy, .supply = supply); - (i:0..NyC-1:d_dr_y.in.d[i] = addr_buf.out.d.d[i+NxC];) + decoder_dualrail d_dr_y(.out = outy, .supply = supply); + (i:0..NyC-1:d_dr_y.in.d[i] = addr_buf.out.d.d[i+NxC];) - } +} - export template - defproc and_grid(bool! out[Nx*Ny]; bool? inx[Nx], iny[Ny]; power supply) { - AND2_X1 ands[Nx*Ny]; - (i:0..Nx*Ny-1:ands[i].vss = supply.vss; ands[i].vdd = supply.vdd;) - (x:0..Nx-1: - (y:0..Ny-1: - ands[x + y*Nx].a = inx[x]; - ands[x + y*Nx].b = iny[y]; - ands[x + y*Nx].y = out[x + y*Nx]; - ) - ) - } - - - /** - * 2D decoder which uses synapse handshaking using line pulldowns. - * Nx is the x size of the decoder array - * NxC is the number of wires in the x channel. - * but my guess is that we can't do logs... - * the req on a1of1 out is the req to each synapse. - * The ack back from each line should go high when the synapse is charged. - * N_dly is a hard coded delay of the pull down circuit. - * It can be set to 0. - */ - export template - defproc decoder_2d_hs (avMx1of2 in; a1of1 out[Nx*Ny]; bool? 
reset_B; power supply) { - - // Buffer to recieve concat(x,y) address packet - buffer addr_buf(.in = in, .reset_B = reset_B, .supply = supply); - - // Decoder X/Y And trees - decoder_dualrail d_dr_x(.supply = supply); - (i:0..NxC-1:d_dr_x.in.d[i] = addr_buf.out.d.d[i];) - decoder_dualrail d_dr_y(.supply = supply); - (i:0..NyC-1:d_dr_y.in.d[i] = addr_buf.out.d.d[i+NxC];) - - // Validity - vtree vtree_x (.supply = supply); - vtree vtree_y (.supply = supply); - (i:0..NxC-1:vtree_x.in.d[i].t = addr_buf.out.d.d[i].t;) - (i:0..NxC-1:vtree_x.in.d[i].f = addr_buf.out.d.d[i].f;) - (i:0..NyC-1:vtree_y.in.d[i].t = addr_buf.out.d.d[i+NxC].t;) - (i:0..NyC-1:vtree_y.in.d[i].f = addr_buf.out.d.d[i+NxC].f;) - A_2C_B_X1 C2el(.c1 = vtree_x.out, .c2 = vtree_y.out, .y = addr_buf.out.v, - .vdd = supply.vdd, .vss = supply.vss); - - - // and grid for reqs into synapses - and_grid _and_grid(.inx = d_dr_x.out, .iny = d_dr_y.out, .supply = supply); - (i:Nx*Ny: out[i].r = _and_grid.out[i];) - - // Acknowledge pull down time - - // Pull DOWNs on the reqB lines by synapses (easier to invert). - bool _out_reqsB[Nx], _out_acksB[Nx]; // The vertical output ack lines from each syn. - PULLDOWN2_X4 req_pulldowns[Nx*Ny]; - pint index; - (i:Nx: - (j:Ny: - index = i + Nx*j; - req_pulldowns[index].a = out[index].a; - req_pulldowns[index].b = _out_acksB[i]; - req_pulldowns[index].y = _out_reqsB[i]; - req_pulldowns[index].vss = supply.vss; - req_pulldowns[index].vdd = supply.vdd; - ) +export template +defproc and_grid(bool! out[Nx*Ny]; bool? 
inx[Nx], iny[Ny]; power supply) { + AND2_X1 ands[Nx*Ny]; + (i:0..Nx*Ny-1:ands[i].vss = supply.vss; ands[i].vdd = supply.vdd;) + (x:0..Nx-1: + (y:0..Ny-1: + ands[x + y*Nx].a = inx[x]; + ands[x + y*Nx].b = iny[y]; + ands[x + y*Nx].y = out[x + y*Nx]; ) - - // ReqB keep cells - KEEP_X1 req_keeps[Nx]; - (i:Nx: - req_keeps[i].y = _out_reqsB[i]; - req_keeps[i].vdd = supply.vdd; - req_keeps[i].vss = supply.vss; - ) - - // req-ack buffers - // Delay needed here, since otherwise the pull up of reqB happens too quickly. - // Means that the pull up may start fighting the synapse, - // since the synapse has not yet retracted its ack. - // Also there is the possibility, if really fast, that the line pull up block - // doesn't yet see that the input is valid, and starts pulling up. - // In any case, this delay is important. - sigbuf req_bufs[Nx]; - delay_chain ack_delays[Nx]; - (i:Nx: - ack_delays[i].in = _out_reqsB[i]; - ack_delays[i].supply = supply; - - // req_bufs[i].in = _out_reqsB[i]; - req_bufs[i].in = ack_delays[i].out; - req_bufs[i].out[0] = _out_acksB[i]; // DANGER DANGER - req_bufs[i].supply = supply; + ) +} +/** + * 2D decoder which uses synapse handshaking using line pulldowns. + * Nx is the x size of the decoder array + * NxC is the number of wires in the x channel. + * (ideally NxC = ceil(log2(Nx)), but my guess is that we can't do logs...) + * the req on a1of1 out is the req to each synapse. + * The ack back from each line should go high when the synapse is charged. + * N_dly is a hard coded delay of the pull down circuit. + * It can be set to 0. + */ +export template +defproc decoder_2d_hs (avMx1of2 in; a1of1 out[Nx*Ny]; bool? 
reset_B; power supply) { - ) + // Buffer to receive concat(x,y) address packet + buffer addr_buf(.in = in, .reset_B = reset_B, .supply = supply); - // Line end pull UPs (triggered once synapse reqs removed) - OR2_X1 pu_ORs[Nx]; - PULLUP_X4 pu[Nx]; // TODO probably replace this with variable strength PU - AND2_X1 pu_ANDs[Nx]; - (i:Nx: - pu_ORs[i].a = _out_acksB[i]; - pu_ORs[i].b = d_dr_x.out[i]; - pu_ORs[i].vdd = supply.vdd; - pu_ORs[i].vss = supply.vss; + // Decoder X/Y And trees + decoder_dualrail d_dr_x(.supply = supply); + (i:0..NxC-1:d_dr_x.in.d[i] = addr_buf.out.d.d[i];) + decoder_dualrail d_dr_y(.supply = supply); + (i:0..NyC-1:d_dr_y.in.d[i] = addr_buf.out.d.d[i+NxC];) - pu_ANDs[i].a = pu_ORs[i].y; - pu_ANDs[i].b = reset_B; // TODO buffer - pu_ANDs[i].vdd = supply.vdd; - pu_ANDs[i].vss = supply.vss; + // Validity + vtree vtree_x (.supply = supply); + vtree vtree_y (.supply = supply); + (i:0..NxC-1:vtree_x.in.d[i].t = addr_buf.out.d.d[i].t;) + (i:0..NxC-1:vtree_x.in.d[i].f = addr_buf.out.d.d[i].f;) + (i:0..NyC-1:vtree_y.in.d[i].t = addr_buf.out.d.d[i+NxC].t;) + (i:0..NyC-1:vtree_y.in.d[i].f = addr_buf.out.d.d[i+NxC].f;) + A_2C_B_X1 C2el(.c1 = vtree_x.out, .c2 = vtree_y.out, .y = addr_buf.out.v, + .vdd = supply.vdd, .vss = supply.vss); - pu[i].a = pu_ANDs[i].y; - pu[i].y = _out_reqsB[i]; - pu[i].vdd = supply.vdd; - pu[i].vss = supply.vss; - ) - // ORtree from all output reqs, back to the buffer ack. - // This is instead of the ack that came from the delayed validity trees, - // in decoder_2d_dly. 
- ortree _ortree(.out = addr_buf.out.a, .supply = supply); - INV_X1 out_req_invs[Nx]; - (i:Nx: - out_req_invs[i].a = _out_reqsB[i]; - out_req_invs[i].vdd = supply.vdd; - out_req_invs[i].vss = supply.vss; + // and grid for reqs into synapses + and_grid _and_grid(.inx = d_dr_x.out, .iny = d_dr_y.out, .supply = supply); + (i:Nx*Ny: out[i].r = _and_grid.out[i];) - _ortree.in[i] = out_req_invs[i].y; - ) - } + // Acknowledge pull down time + // Pull DOWNs on the ackB lines by synapses (easier to invert). + bool _out_acksB[Nx]; // The vertical output ack lines from each syn. + PULLDOWN2_X4 ack_pulldowns[Nx*Ny]; + pint index; + (i:Nx: + (j:Ny: + index = i + Nx*j; + ack_pulldowns[index].a = out[index].a; + ack_pulldowns[index].b = d_dr_x.out[i]; + ack_pulldowns[index].y = _out_acksB[i]; + ack_pulldowns[index].vss = supply.vss; + ack_pulldowns[index].vdd = supply.vdd; + ) + ) + + // Line end pull UPs (triggered once reqs removed) + PULLUP_X4 pu[Nx]; // TODO probably replace this with variable strength PU + AND2_X1 pu_ANDs[Nx]; + (i:Nx: + pu_ANDs[i].a = d_dr_x.out[i]; + pu_ANDs[i].b = reset_B; // TODO buffer + pu_ANDs[i].vdd = supply.vdd; + pu_ANDs[i].vss = supply.vss; + + pu[i].a = pu_ANDs[i].y; + pu[i].y = _out_acksB[i]; + pu[i].vdd = supply.vdd; + pu[i].vss = supply.vss; + ) + + // ORtree from all output acks, back to the buffer ack. + // This is instead of the ack that came from the delayed validity trees, + // in decoder_2d_dly. + ortree _ortree(.out = addr_buf.out.a, .supply = supply); + INV_X1 out_ack_invs[Nx]; + (i:Nx: + out_ack_invs[i].a = _out_acksB[i]; + out_ack_invs[i].vdd = supply.vdd; + out_ack_invs[i].vss = supply.vss; + + _ortree.in[i] = out_ack_invs[i].y; + ) +}