actlib_dataflow_neuro/dataflow_neuro/coders.act

426 lines
13 KiB
Plaintext

/*************************************************************************
*
* This file is part of ACT dataflow neuro library
*
* Copyright (c) 2022 University of Groningen - Ole Richter
* Copyright (c) 2022 University of Groningen - Michele Mastella
* Copyright (c) 2022 University of Groningen - Hugh Greatorex
* Copyright (c) 2022 University of Groningen - Madison Cotteret
*
*
* This source describes Open Hardware and is licensed under the CERN-OHL-W v2 or later
*
* You may redistribute and modify this documentation and make products
* using it under the terms of the CERN-OHL-W v2 (https://cern.ch/cern-ohl).
* This documentation is distributed WITHOUT ANY EXPRESS OR IMPLIED
* WARRANTY, INCLUDING OF MERCHANTABILITY, SATISFACTORY QUALITY
* AND FITNESS FOR A PARTICULAR PURPOSE. Please see the CERN-OHL-W v2
* for applicable conditions.
*
* Source location: https://git.web.rug.nl/bics/actlib_dataflow_neuro
*
* As per CERN-OHL-W v2 section 4.1, should You produce hardware based on
* these sources, You must maintain the Source Location visible in its
* documentation.
*
**************************************************************************
*/
import "../../dataflow_neuro/cell_lib_async.act";
import "../../dataflow_neuro/cell_lib_std.act";
import "../../dataflow_neuro/treegates.act";
import "../../dataflow_neuro/primitives.act";
// import tmpl::dataflow_neuro;
// import tmpl::dataflow_neuro;
import std::channel;
open std::channel;
import std::data;
open std::data;
// import dev::channel;
// open dev::channel;
namespace tmpl {
namespace dataflow_neuro {
/**
 * 2D address decoder with a programmable delay on the acknowledge path.
 *
 * Template parameters:
 *   NxC, NyC   number of dual-rail bits in the x / y part of the address.
 *              NxC should be something like ceil(log2(Nx)); it has to be
 *              passed explicitly.
 *   Nx, Ny     number of decoded x / y output lines.
 *   N_dly_cfg  number of config bits of the ACK delay line; per the original
 *              author, all bits high corresponds to 2**N_dly_cfg - 1
 *              DLY4_X1 cells.
 *
 * Ports:
 *   in         concat(x,y) address packet; bits 0..NxC-1 are x,
 *              bits NxC..NxC+NyC-1 are y.
 *   outx/outy  decoded lines, driven by the per-line AND trees.
 *   dly_cfg    select bits of the programmable delay line.
 *   reset_B    reset, forwarded to the input buffer.
 *   supply     power rails.
 */
export template<pint NxC, NyC, Nx, Ny, N_dly_cfg>
defproc decoder_2d_dly (avMx1of2<NxC+NyC> in; bool? outx[Nx], outy[Ny],
                        dly_cfg[N_dly_cfg], reset_B; power supply) {

    // Buffer that receives the concat(x,y) address packet.
    // TODO (original author): the buffer outputs probably need extra buffering.
    buffer<NxC+NyC> addr_buf(.in = in, .reset_B = reset_B, .supply = supply);

    // Validity trees, one over the x bits and one over the y bits.
    vtree<NxC> vtree_x (.supply = supply);
    vtree<NyC> vtree_y (.supply = supply);
    (i:0..NxC-1:vtree_x.in.d[i].t = addr_buf.out.d.d[i].t;)
    (i:0..NxC-1:vtree_x.in.d[i].f = addr_buf.out.d.d[i].f;)
    (i:0..NyC-1:vtree_y.in.d[i].t = addr_buf.out.d.d[i+NxC].t;)
    (i:0..NyC-1:vtree_y.in.d[i].f = addr_buf.out.d.d[i+NxC].f;)

    // Both validities are joined by a C-element. Its output is fed back
    // undelayed as the buffer's valid, and through the programmable delay
    // line as the buffer's acknowledge (the ack is delayed, the valid is not).
    A_2C_B_X1 C2el(.c1 = vtree_x.out, .c2 = vtree_y.out, .vdd = supply.vdd, .vss = supply.vss);
    addr_buf.out.v = C2el.y;
    delayprog<N_dly_cfg> dly(.in = C2el.y, .s = dly_cfg, .supply = supply);
    dly.out = addr_buf.out.a;

    // AND trees: line i of a dimension combines, for every address bit j,
    // the true rail if bit j of i is 1 and the false rail if it is 0.
    pint bitval;

    andtree<NxC> atree_x[Nx];
    (i:0..Nx-1:
        atree_x[i].supply = supply;
        // Connected once per tree rather than once per address bit.
        atree_x[i].out = outx[i];
        (j:0..NxC-1:
            bitval = (i & ( 1 << j )) >> j; // binary digit j of integer i (always 0 or 1)
            [ bitval = 1 ->
                atree_x[i].in[j] = addr_buf.out.d.d[j].t;
            [] bitval = 0 ->
                atree_x[i].in[j] = addr_buf.out.d.d[j].f;
            ]
        )
    )

    andtree<NyC> atree_y[Ny];
    (i:0..Ny-1:
        atree_y[i].supply = supply;
        atree_y[i].out = outy[i];
        (j:0..NyC-1:
            bitval = (i & ( 1 << j )) >> j; // binary digit j of integer i (always 0 or 1)
            [ bitval = 1 ->
                atree_y[i].in[j] = addr_buf.out.d.d[j+NxC].t;
            [] bitval = 0 ->
                atree_y[i].in[j] = addr_buf.out.d.d[j+NxC].f;
            ]
        )
    )
}
/*
 * Build a binary tree of arbiter_handshake cells that merges the N a1of1
 * inputs in[0..N-1] into the single a1of1 channel `out`.
 *
 * The tree is built layer by layer: neighbouring nodes of a layer are paired
 * into one arbiter_handshake each; an odd node left over at the end of a
 * layer is passed through unchanged to the next layer.
 *
 *   N       number of inputs, must be > 0
 *   in      the channels to arbitrate between
 *   out     root of the tree
 *   supply  power rails, forwarded to every arbiter_handshake
 */
export template<pint N>
defproc arbtree (a1of1 in[N]; a1of1 out; power supply)
{
    { N > 0 : "What?" };

    pint i, end, j;
    pint arbCount;

    /*
     * Dry run of the construction below: count the arbiters needed and
     * find the total number of tree nodes (end+1 once the loop finishes).
     */
    i = 0;
    end = N-1;
    arbCount = 0;
    *[ i != end ->
        j = 0;
        *[ i <= end ->
            j = j + 1;
            [ i = end ->
                /*-- odd node out: passed through, no arbiter --*/
                i = end+1;
            [] else ->
                /*-- a pair: one arbiter --*/
                i = i + 2;
                arbCount = arbCount + 1;
            ]
        ]
        /*-- the inner loop always leaves i at end+1; the next layer has j nodes --*/
        end = end+j;
    ]

    /* array that holds ALL the nodes of the arbiter tree */
    a1of1 tmp[end+1];

    /* the first N nodes are the inputs */
    (l:N:
        tmp[l] = in[l];
    )

    /* the arbiters themselves; none are needed when N = 1 */
    [ arbCount > 0 ->
        arbiter_handshake arbs[arbCount];
    ]
    (h:arbCount:arbs[h].supply = supply;)

    /* Restart the walk, this time making the connections */
    i = 0;
    end = N-1;
    j = 0;
    pint arbIndex = 0;

    /* Invariant: i <= end */
    *[ i != end ->
        /*
         * Invariant: tmp[i..end] holds the nodes of the current layer that
         * still have to be combined; j counts the nodes produced for the
         * next layer, which are placed at tmp[end+1..end+j].
         */
        j = 0;
        *[ i <= end ->
            j = j + 1;
            [ i = end ->
                /*-- single node left: pipe it through to the next layer --*/
                tmp[end+j] = tmp[i];
                i = end+1;
            [] else ->
                /*-- at least two nodes left: arbitrate between the next pair --*/
                arbs[arbIndex].in1 = tmp[i];
                arbs[arbIndex].in2 = tmp[i+1];
                arbs[arbIndex].out = tmp[end+j];
                arbIndex = arbIndex + 1;
                i = i + 2;
            ]
        ]
        /*-- move on to the layer that was just produced --*/
        i = end+1;
        end = end+j;
        j = 0;
    ]

    out = tmp[end];
}
// Grid of Nx*Ny two-input AND gates.
// The gate at flat index x + y*Nx combines inx[x] with iny[y] and drives
// out[x + y*Nx]. All gates share the rails of `supply`.
export template<pint Nx, Ny>
defproc and_grid(bool! out[Nx*Ny]; bool? inx[Nx], iny[Ny]; power supply) {
    AND2_X1 ands[Nx*Ny];
    (row:Ny:
        (col:Nx:
            // power
            ands[row*Nx + col].vdd = supply.vdd;
            ands[row*Nx + col].vss = supply.vss;
            // signals
            ands[row*Nx + col].a = inx[col];
            ands[row*Nx + col].b = iny[row];
            ands[row*Nx + col].y = out[row*Nx + col];
        )
    )
}
// Builds the OR trees that turn N one-hot input lines into an Nc-bit
// dual-rail binary code.
//
// For output bit b, the true rail ORs every input whose index has bit b set,
// and the false rail ORs every input whose index has bit b clear.
// The trees are always sized for 2**Nc inputs; indices from N up to
// 2**Nc - 1 have no input line and are tied low instead.
export template<pint Nc, N>
defproc dualrail_encoder(bool? in[N]; Mx1of2<Nc> out; power supply) {
    {N <= 1<<Nc : "Num inputs too wide for encoding channel!"};

    // Number of codes representable with Nc bits (N rounded up to a power of 2).
    pint n_codes;
    n_codes = (1<<Nc);

    // Every bit position is set in exactly half of the codes and clear in the
    // other half, so each rail needs an OR tree with n_codes/2 inputs.
    ortree<n_codes/2> ors_t[Nc];
    ortree<n_codes/2> ors_f[Nc];

    // Constant low for the tree inputs that have no corresponding input line.
    TIELO_X1 tielo(.vdd = supply.vdd, .vss = supply.vss);

    pint slot_t, slot_f;  // next free input of the current true / false tree
    pint digit;           // bit `b` of the code currently being placed

    (b:Nc:
        ors_t[b].supply = supply;
        ors_f[b].supply = supply;
        ors_t[b].out = out.d[b].t;
        ors_f[b].out = out.d[b].f;

        slot_t = 0;
        slot_f = 0;
        (code:n_codes:
            digit = (code & ( 1 << b )) >> b;
            [ digit = 1 ->
                // bit is set: this code feeds the true rail
                [ code <= N-1 ->
                    ors_t[b].in[slot_t] = in[code];
                [] else ->
                    ors_t[b].in[slot_t] = tielo.y;
                ]
                slot_t = slot_t + 1;
            [] else ->
                // bit is clear: this code feeds the false rail
                [ code <= N-1 ->
                    ors_f[b].in[slot_f] = in[code];
                [] else ->
                    ors_f[b].in[slot_f] = tielo.y;
                ]
                slot_f = slot_f + 1;
            ]
        )
    )
}
// 2D encoder front end: arbitrates between N x-requests and M y-requests and
// drives an address_size-bit dual-rail address channel with validity.
//
//   N, M          number of x / y request channels
//   address_size  number of dual-rail bits of the output address
//   ACK_STRENGTH  template parameter of the sigbufs that drive the acks
//                 back to x[] and y[]
//
// NOTE(review): this block looks unfinished. Several internal nodes are
// declared but never driven inside it; they are marked below where they
// occur. The fixes applied here are limited to wiring mistakes that are
// evident from the block itself.
export template<pint N, M, address_size, ACK_STRENGTH>
defproc encoder2D(a1of1 x[N]; a1of1 y[M] ;avMx1of2<address_size> addr; power supply; bool reset_B) {
    // Reset buffers.
    // The buffered reset array must match the sigbuf that drives it and the
    // highest index used below (i+3+address_size), i.e. 2*address_size+3 entries.
    // Entries 0..2 are not used in this block.
    bool _reset_BX,_reset_BXX[2*address_size+3];
    BUF_X1 reset_buf(.a=reset_B, .y=_reset_BX,.vdd=supply.vdd,.vss=supply.vss);
    sigbuf<2*address_size+3> reset_bufarray(.in=_reset_BX, .out=_reset_BXX,.supply=supply);

    // Arbiters: one tree over the x requests and one over the y requests.
    a1of1 _out_arb_x,_out_arb_y;
    a1of1 _x_temp[N],_y_temp[M];
    (i:N:
        _x_temp[i].r = x[i].r;
    )
    (i:M:
        _y_temp[i].r = y[i].r;
    )
    arbtree<N> Xarb(.in = _x_temp,.out = _out_arb_x,.supply = supply);
    arbtree<M> Yarb(.in = _y_temp,.out = _out_arb_y,.supply = supply);

    // Sigbufs for strong acknowledge signals back to the requesters.
    sigbuf_1output<ACK_STRENGTH> x_ack_arb[N];
    sigbuf_1output<ACK_STRENGTH> y_ack_arb[M];
    (i:N:
        x_ack_arb[i].in = _x_temp[i].a;
        x_ack_arb[i].out = x[i].a;
        x_ack_arb[i].supply = supply;
    )
    (i:M:
        y_ack_arb[i].in = _y_temp[i].a;
        y_ack_arb[i].out = y[i].a;
        y_ack_arb[i].supply = supply;
    )

    // This block checks that the input is valid and that the y arbiter made
    // a choice, then activates the ack of the y arbiter.
    // NOTE(review): _in_y_v is never driven in this block.
    // NOTE(review): this cell is reset through .reset_B while the other
    // asynchronous cells here use .pr_B/.sr_B - confirm against the cell library.
    bool _x_v,_in_x_v,_in_y_v,_x_a_B,_x_a;
    A_2C2P_RB_X1 Y_ack_confirm();
    Y_ack_confirm.p1 = _x_v;
    Y_ack_confirm.p2 =_in_x_v;
    Y_ack_confirm.c1 = _out_arb_y.r;
    Y_ack_confirm.c2 = _x_a_B;
    Y_ack_confirm.y = _out_arb_y.a;
    Y_ack_confirm.vdd = supply.vdd;
    Y_ack_confirm.vss = supply.vss;
    Y_ack_confirm.reset_B = _reset_BX;

    // Same check for the x arbiter.
    // NOTE(review): the output of this cell is not connected and
    // _out_arb_x.a is never driven in this block - presumably
    // X_ack_confirm.y should drive _out_arb_x.a; confirm before use.
    A_2C_RB_X1 X_ack_confirm();
    X_ack_confirm.c1 = _out_arb_x.r;
    X_ack_confirm.c2 = _x_a_B;
    X_ack_confirm.vdd = supply.vdd;
    X_ack_confirm.vss = supply.vss;
    X_ack_confirm.pr_B = _reset_BX;
    X_ack_confirm.sr_B = _reset_BX;

    // X_REQ validation: OR of all x requests, plus its inverse.
    bool _x_req_array[N],_x_v_B, _en;
    (i:N:_x_req_array[i] = x[i].r;)
    ortree<N> x_req_ortree(.in = _x_req_array,.out = _x_v,.supply = supply);
    INV_X1 not_x_req_ortree(.a = _x_v,.y = _x_v_B,.vdd = supply.vdd, .vss = supply.vss);

    // Generation of the (active-low) x acknowledge _x_a_B.
    A_1C3P2P2N_R_X1 x_ack(); // NEEDS BUFFERING TO X4
    //branch1
    x_ack.p1 = _in_x_v;
    x_ack.p2 = _x_v_B;
    //branch2
    x_ack.p3 = _in_x_v;
    x_ack.p4 = _in_y_v;
    x_ack.p5 = _x_v;
    //
    x_ack.c1 = _en;
    x_ack.n1 = addr.v;
    x_ack.n2 = _in_x_v;
    //
    x_ack.y = _x_a_B;
    //
    x_ack.vdd = supply.vdd;
    x_ack.vss = supply.vss;
    x_ack.pr_B = _reset_BX;
    x_ack.sr_B = _reset_BX;
    INV_X1 not_x_ack(.y = _x_a,.a = _x_a_B,.vdd = supply.vdd, .vss = supply.vss);

    // Enable for the output stage.
    A_1C2P_X1 enabling(.p1 = addr.a, .p2 = addr.v, .c1 = _x_a, .y = _en, .vdd = supply.vdd, .vss = supply.vss);

    // Internal address channel feeding the output stage.
    // NOTE(review): _in[] is never driven in this block, and this loop runs
    // over N while _in_x only has address_size bits - presumably an encoder
    // from the arbitration result is still missing here; confirm.
    avMx1of2<address_size> _in_x;
    dualrail _in[N];
    (i:N:_in_x.d.d[i] = _in[i];)
    _in_x.v = _in_x_v;

    // Output stage (buffer_func_s): one C-element per rail of every address bit.
    bool _en_X_t[address_size],_en_X_f[address_size],_out_a_BX_f[address_size],_out_a_BX_t[address_size];
    bool _out_a_B;
    A_2C2N_RB_X1 buffer_func_s_f[address_size];
    A_2C2N_RB_X1 buffer_func_s_t[address_size];
    sigbuf<address_size> en_buf_t(.in=_en, .out=_en_X_t, .supply=supply);
    sigbuf<address_size> en_buf_f(.in=_en, .out=_en_X_f, .supply=supply);
    INV_X1 out_a_inv(.a=addr.a,.y=_out_a_B,.vdd = supply.vdd, .vss = supply.vss);
    sigbuf<address_size> out_a_B_buf_f(.in=_out_a_B,.out=_out_a_BX_f, .supply=supply);
    sigbuf<address_size> out_a_B_buf_t(.in=_out_a_B,.out=_out_a_BX_t, .supply=supply);
    (i:address_size:
        // false rail of bit i
        buffer_func_s_f[i].c1 = _en_X_f[i];
        buffer_func_s_f[i].c2 = _out_a_BX_f[i];
        buffer_func_s_f[i].n1 = _in_x.d.d[i].f;
        // second n-input takes the validity (was wired to n1 a second time,
        // which shorted the data rail to the validity signal)
        buffer_func_s_f[i].n2 = _in_x.v;
        buffer_func_s_f[i].vdd=supply.vdd;
        buffer_func_s_f[i].vss=supply.vss;
        buffer_func_s_f[i].pr_B = _reset_BXX[i+3];
        buffer_func_s_f[i].sr_B = _reset_BXX[i+3];
        buffer_func_s_f[i].y = addr.d.d[i].f;
        // true rail of bit i
        buffer_func_s_t[i].c1 = _en_X_t[i];
        buffer_func_s_t[i].c2 = _out_a_BX_t[i];
        buffer_func_s_t[i].n1 = _in_x.d.d[i].t;
        buffer_func_s_t[i].n2 = _in_x.v;
        buffer_func_s_t[i].vdd=supply.vdd;
        buffer_func_s_t[i].vss=supply.vss;
        buffer_func_s_t[i].pr_B = _reset_BXX[i+3+address_size];
        buffer_func_s_t[i].sr_B = _reset_BXX[i+3+address_size];
        buffer_func_s_t[i].y = addr.d.d[i].t;
    )

    // Validity of the output address, buffered onto addr.v.
    bool _addr_v;
    Mx1of2<address_size> addr_temp;
    (i:address_size:addr_temp.d[i] = addr.d.d[i];)
    vtree<address_size> addr_validity(.in = addr_temp,.out = _addr_v);
    sigbuf_1output<4> addr_validity_x(.in = _addr_v,.out = addr.v);
    addr_validity.supply = supply;
    addr_validity_x.supply = supply;
}
}
}