/*************************************************************************
 *
 *  This file is part of ACT dataflow neuro library
 *
 *  Copyright (c) 2022 University of Groningen - Ole Richter
 *  Copyright (c) 2022 University of Groningen - Michele Mastella
 *  Copyright (c) 2022 University of Groningen - Hugh Greatorex
 *  Copyright (c) 2022 University of Groningen - Madison Cotteret
 * 
 *
 *  This source describes Open Hardware and is licensed under the CERN-OHL-W v2 or later
 *
 *  You may redistribute and modify this documentation and make products
 *  using it under the terms of the CERN-OHL-W v2 (https:/cern.ch/cern-ohl).
 *  This documentation is distributed WITHOUT ANY EXPRESS OR IMPLIED
 *  WARRANTY, INCLUDING OF MERCHANTABILITY, SATISFACTORY QUALITY
 *  AND FITNESS FOR A PARTICULAR PURPOSE. Please see the CERN-OHL-W v2
 *  for applicable conditions.
 *
 *  Source location: https://git.web.rug.nl/bics/actlib_dataflow_neuro
 *
 *  As per CERN-OHL-W v2 section 4.1, should You produce hardware based on
 *  these sources, You must maintain the Source Location visible in its
 *  documentation.
 *
 **************************************************************************
 */
import "../../dataflow_neuro/cell_lib_async.act";
import "../../dataflow_neuro/cell_lib_std.act";
import "../../dataflow_neuro/treegates.act";
import "../../dataflow_neuro/primitives.act";
// import tmpl::dataflow_neuro;
// import tmpl::dataflow_neuro;
import std::channel;
open std::channel;

// import std::func;
open std;

import std::data;
open std::data;


// import dev::channel;
// open dev::channel;


namespace tmpl {
	namespace dataflow_neuro {

/**
 * Dualrail decoder.
 * Nc is the number of dualrail input channels (address bits).
 * N is the number of decoded output lines.
 * Builds one Nc-input AND tree per output line. For output i, tree input j
 * is wired to the true rail of address bit j when bit j of i is 1, and to
 * the false rail of address bit j when that bit is 0.
 * Every rail is fanned out through a sigbuf<N>, so each AND tree gets its
 * own buffered copy of the rail.
 * NOTE(review): out[] is declared "bool?" although it is connected to the
 * AND tree outputs -- confirm the intended direction flag.
 */
export template<pint Nc, N>
defproc decoder_dualrail (Mx1of2<Nc> in; bool? out[N]; power supply) {
	// Signal buffers: one N-way fan-out per input rail
	sigbuf<N> in_tX[Nc];
	sigbuf<N> in_fX[Nc];
	(i:Nc:
		in_tX[i].supply = supply;
		in_tX[i].in = in.d[i].t;

		in_fX[i].supply = supply;
		in_fX[i].in = in.d[i].f;
	)

	// AND trees, one per decoded output line
	pint bitval;
	andtree<Nc> atree[N];
	(i:0..N-1:
		atree[i].supply = supply;
		// Connect the output once per tree (independent of the bit loop below)
		atree[i].out = out[i];
		(j:0..Nc-1:
			bitval = (i & ( 1 << j )) >> j; // Get binary digit of integer i, column j
			[bitval = 1 ->
				atree[i].in[j] = in_tX[j].out[i];
				[]bitval = 0 ->
				atree[i].in[j] = in_fX[j].out[i];
				// A single extracted bit can only be 0 or 1; guard against expression errors
				[]bitval >= 2 -> {false : "decoder_dualrail: extracted address bit is neither 0 nor 1"};
				]
			)
		)
}

/**
 * Dualrail decoder with buffered outputs.
 * Each of the N decoded lines is fanned out through a sigbuf<OUT_STRENGTH>.
 * Be careful of out[] indexing: copy m of decoded line k is found at
 * out[m + k*OUT_STRENGTH].
 */
export template<pint Nc, N, OUT_STRENGTH>
defproc decoder_dualrailX(Mx1of2<Nc> in; bool? out[N*OUT_STRENGTH]; power supply) {
	// Plain decoder producing the N unbuffered lines
	decoder_dualrail<Nc, N> decoder;
	decoder.in = in;
	decoder.supply = supply;

	// One fan-out buffer per decoded line
	sigbuf<OUT_STRENGTH> sb[N];
	(k:0..N-1:
		sb[k].supply = supply;
		sb[k].in = decoder.out[k];
		(m:0..OUT_STRENGTH-1:
			sb[k].out[m] = out[m + k*OUT_STRENGTH];
		)
	)
}


/**
 * 2D decoder which uses a configurable delay from the validity trees to
 * the buffer ack.
 *	Nx / Ny are the x / y sizes of the decoder array.
 * 	NxC / NyC are the numbers of dualrail wires in the x / y part of the
 * 	address; x uses address bits 0..NxC-1, y uses bits NxC..NxC+NyC-1.
 * 	Thus NxC should be something like NxC = ceil(log2(Nx)); it is passed
 * 	explicitly rather than computed here.
 * 	N_dly_cfg is the number of config bits in the ACK delay line,
 * 	with all bits high corresponding to 2**N_dly_cfg -1 DLY4_X1 cells.
 */
export template<pint NxC, NyC, Nx, Ny, N_dly_cfg>
defproc decoder_2d_dly (avMx1of2<NxC+NyC> in; bool? outx[Nx], outy[Ny], 
	dly_cfg[N_dly_cfg], reset_B; power supply) {

	// Buffer to receive the concat(x,y) address packet
	buffer<NxC+NyC> addr_buf;
	addr_buf.in = in;
	addr_buf.reset_B = reset_B;
	addr_buf.supply = supply;

	// X part of the address: validity tree and one-hot decoder
	vtree<NxC> vtree_x (.supply = supply);
	decoder_dualrail<NxC,Nx> d_dr_x(.out = outx, .supply = supply);
	(k:NxC:
		vtree_x.in.d[k].t = addr_buf.out.d.d[k].t;
		vtree_x.in.d[k].f = addr_buf.out.d.d[k].f;
		d_dr_x.in.d[k] = addr_buf.out.d.d[k];
	)

	// Y part of the address: same structure, bits offset by NxC
	vtree<NyC> vtree_y (.supply = supply);
	decoder_dualrail<NyC,Ny> d_dr_y(.out = outy, .supply = supply);
	(k:NyC:
		vtree_y.in.d[k].t = addr_buf.out.d.d[k+NxC].t;
		vtree_y.in.d[k].f = addr_buf.out.d.d[k+NxC].f;
		d_dr_y.in.d[k] = addr_buf.out.d.d[k+NxC];
	)

	// Combined validity drives the buffer's valid input directly ...
	A_2C_B_X1 C2el(.c1 = vtree_x.out, .c2 = vtree_y.out, .vdd = supply.vdd, .vss = supply.vss);
	addr_buf.out.v = C2el.y;

	// ... while the ack is the same signal after the programmable delay
	// (the ack line is delayed, the val is not).
	delayprog<N_dly_cfg> dly(.in = C2el.y, .s = dly_cfg, .supply = supply);
	dly.out = addr_buf.out.a;

}

/**
 * Nx-by-Ny grid of AND2 cells.
 * Gate number x + y*Nx takes a buffered copy of inx[x] and a buffered copy
 * of iny[y] as inputs and drives out[x + y*Nx].
 * Each x line is fanned out Ny times and each y line Nx times through
 * sigbufs, so every gate has its own copy of both inputs.
 */
export template<pint Nx, Ny>
defproc and_grid(bool! out[Nx*Ny]; bool? inx[Nx], iny[Ny]; power supply) {
	// Fan-out buffers for the row/column inputs
	sigbuf<Ny> xbuf[Nx];
	sigbuf<Nx> ybuf[Ny];
	(k:0..Nx-1:
		xbuf[k].supply = supply;
		xbuf[k].in = inx[k];
	)
	(k:0..Ny-1:
		ybuf[k].supply = supply;
		ybuf[k].in = iny[k];
	)

	// The gates themselves, wired and powered in a single pass over the grid
	AND2_X1 ands[Nx*Ny];
	pint idx;
	(y:0..Ny-1:
		(x:0..Nx-1:
			idx = x + y*Nx;
			ands[idx].vdd = supply.vdd;
			ands[idx].vss = supply.vss;
			ands[idx].a = xbuf[x].out[y];
			ands[idx].b = ybuf[y].out[x];
			ands[idx].y = out[idx];
			)
		)
}


/**
 * 2D decoder which uses synapse handshaking using line pulldowns.
 *	Nx / Ny are the x / y sizes of the decoder array.
 * 	NxC / NyC are the numbers of dualrail wires in the x / y part of the
 * 	address; x uses address bits 0..NxC-1, y uses bits NxC..NxC+NyC-1.
 * 	The req on a1of1 out[x + Nx*y] is the req to the synapse at (x, y).
 * 	The ack back from each line should go high when the synapse is charged.
 * 	The buffer ack is built from the per-column ackB lines instead of from
 * 	a delayed validity signal as in decoder_2d_dly.
 */
export template<pint NxC, NyC, Nx, Ny>
defproc decoder_2d_hs (avMx1of2<NxC+NyC> in; a1of1 out[Nx*Ny]; bool? reset_B; power supply) {

	// Buffer to receive the concat(x,y) address packet
	buffer<NxC+NyC> addr_buf(.in = in, .reset_B = reset_B, .supply = supply);

	// X part of the address: one-hot decoder and validity tree
	decoder_dualrail<NxC,Nx> d_dr_x(.supply = supply);
	vtree<NxC> vtree_x (.supply = supply);
	(k:NxC:
		d_dr_x.in.d[k] = addr_buf.out.d.d[k];
		vtree_x.in.d[k].t = addr_buf.out.d.d[k].t;
		vtree_x.in.d[k].f = addr_buf.out.d.d[k].f;
	)

	// Y part of the address: same structure, bits offset by NxC
	decoder_dualrail<NyC,Ny> d_dr_y(.supply = supply);
	vtree<NyC> vtree_y (.supply = supply);
	(k:NyC:
		d_dr_y.in.d[k] = addr_buf.out.d.d[k+NxC];
		vtree_y.in.d[k].t = addr_buf.out.d.d[k+NxC].t;
		vtree_y.in.d[k].f = addr_buf.out.d.d[k+NxC].f;
	)

	// Combined validity of both address halves, fed back as the buffer's valid
	A_2C_B_X1 valid_Cel(.c1 = vtree_x.out, .c2 = vtree_y.out, .y = addr_buf.out.v,
		.vdd = supply.vdd, .vss = supply.vss);

	// and grid for reqs into synapses
	and_grid<Nx, Ny> _and_grid(.inx = d_dr_x.out, .iny = d_dr_y.out, .supply = supply);
	(k:Nx*Ny: out[k].r = _and_grid.out[k];)

	// Acknowledge path, built column by column.
	bool _out_acksB[Nx]; // The vertical output ack lines from each syn.
	// sig buf for reqx lines, since they go to synapse pull down gates:
	// Ny copies for the pulldowns plus one extra copy (index Ny) for the pullup.
	sigbuf<Ny+1> d_dr_xX[Nx];
	// Pull DOWNs on the ackB lines by synapses (easier to invert).
	A_2N_U_X4 ack_pulldowns[Nx*Ny];
	// Line end pull UPs (triggered once reqs removed).
	// Two separate pullups (req and reset) are used rather than an and-pullup:
	// it is smaller, and the delay that an AND induces means that the pullup
	// could end up fighting a synapse pulldown, as both have the correct req sigs.
	A_1P_U_X4 pu[Nx]; // TODO probably replace this with variable strength PU
	A_1P_U_X4 pu_reset[Nx];
	// ORtree from all output acks, back to the buffer ack.
	INV_X1 out_ack_invs[Nx];
	ortree<Nx>  _ortree(.supply = supply);

	pint index;
	(x:0..Nx-1:
		d_dr_xX[x].supply = supply;
		d_dr_xX[x].in = d_dr_x.out[x];

		// One pulldown per synapse in this column, gated by the synapse ack
		// and a buffered copy of the column's x request.
		(y:0..Ny-1:
			index = x + Nx*y;
			ack_pulldowns[index].vdd = supply.vdd;
			ack_pulldowns[index].vss = supply.vss;
			ack_pulldowns[index].a = out[index].a;
			ack_pulldowns[index].b = d_dr_xX[x].out[y];
			ack_pulldowns[index].y = _out_acksB[x];
		)

		// Request-controlled pullup uses the extra sigbuf copy
		pu[x].vdd = supply.vdd;
		pu[x].vss = supply.vss;
		pu[x].a = d_dr_xX[x].out[Ny];
		pu[x].y = _out_acksB[x];

		// Reset-controlled pullup
		pu_reset[x].vdd = supply.vdd;
		pu_reset[x].vss = supply.vss;
		pu_reset[x].a = reset_B;
		pu_reset[x].y = _out_acksB[x];

		// Invert ackB and feed it into the OR tree
		out_ack_invs[x].vdd = supply.vdd;
		out_ack_invs[x].vss = supply.vss;
		out_ack_invs[x].a = _out_acksB[x];
		_ortree.in[x] = out_ack_invs[x].y;
	)

	// C element to ensure that the buffer receives an invalid
	// _only_ once _both_ ackB has been reset, _and_ its output data
	// has been fully invalidated.
	// Otherwise run into the issue that ack is removed before data is invalid.
	A_2C_B_X1 buf_ack_Cel(.c1 = _ortree.out, .c2 = valid_Cel.y, .y = addr_buf.out.a,
		.vdd = supply.vdd, .vss = supply.vss);

}

/**
 * 2D decoder which uses either synapse handshaking, or just a delay.
 * Controlled by the "hs_en" (handshake_enable) config bit.
 * hs_en = 0 -> use delayed version.
 * hs_en = 1 -> use synapse handshaking.
 * Regardless of which version is used, the final ack going to the buffer
 * goes through the prog_delay block.
 * Thus, for the handshaking version to be used "correctly", 
 * dly_cfg should be set to all zeros.
 * Address layout and out[] indexing are the same as in decoder_2d_hs:
 * x uses address bits 0..NxC-1, y uses bits NxC..NxC+NyC-1, and the synapse
 * at (x, y) is out[x + Nx*y].
 */
export template<pint NxC, NyC, Nx, Ny, N_dly_cfg>
defproc decoder_2d_hybrid (avMx1of2<NxC+NyC> in; a1of1 out[Nx*Ny]; bool? dly_cfg[N_dly_cfg], hs_en,
	reset_B; power supply) {

	// Inverted handshake enable, used by the line-end pullups below
	bool hs_enB;
	INV_X4 hs_inv(.a = hs_en, .y = hs_enB, .vdd = supply.vdd, .vss = supply.vss);

	// Buffer to receive the concat(x,y) address packet
	buffer<NxC+NyC> addr_buf(.in = in, .reset_B = reset_B, .supply = supply);

	// X part of the address: one-hot decoder and validity tree
	decoder_dualrail<NxC,Nx> d_dr_x(.supply = supply);
	vtree<NxC> vtree_x (.supply = supply);
	(k:NxC:
		d_dr_x.in.d[k] = addr_buf.out.d.d[k];
		vtree_x.in.d[k].t = addr_buf.out.d.d[k].t;
		vtree_x.in.d[k].f = addr_buf.out.d.d[k].f;
	)

	// Y part of the address: same structure, bits offset by NxC
	decoder_dualrail<NyC,Ny> d_dr_y(.supply = supply);
	vtree<NyC> vtree_y (.supply = supply);
	(k:NyC:
		d_dr_y.in.d[k] = addr_buf.out.d.d[k+NxC];
		vtree_y.in.d[k].t = addr_buf.out.d.d[k+NxC].t;
		vtree_y.in.d[k].f = addr_buf.out.d.d[k+NxC].f;
	)

	// Combined validity of both address halves, fed back as the buffer's valid
	A_2C_B_X1 valid_Cel(.c1 = vtree_x.out, .c2 = vtree_y.out, .y = addr_buf.out.v,
		.vdd = supply.vdd, .vss = supply.vss);

	// and grid for reqs into synapses
	and_grid<Nx, Ny> _and_grid(.inx = d_dr_x.out, .iny = d_dr_y.out, .supply = supply);
	(k:Nx*Ny: out[k].r = _and_grid.out[k];)

	// Acknowledge path, built column by column.
	bool _out_acksB[Nx]; // The vertical output ack lines from each syn.
	// sig buf for reqx lines, since they go to synapse pull down gates:
	// Ny copies for the pulldowns plus one extra copy (index Ny) for the pullup.
	sigbuf<Ny+1> d_dr_xX[Nx];
	// Pull DOWNs on the ackB lines by synapses (easier to invert).
	A_2N_U_X4 ack_pulldowns[Nx*Ny];
	// Line end pull UPs (triggered once reqs removed).
	// pu is a two-input pullup: input a is the buffered x request,
	// input b is hs_enB. The reset pullup is kept as a separate cell.
	A_2P_U_X4 pu[Nx]; // TODO probably replace this with variable strength PU
	A_1P_U_X4 pu_reset[Nx];
	// Keeps on the ackB lines (currently don't do anything in ACT)
	KEEP_X1 keeps[Nx];
	// ORtree from all output acks, used for the handshake version of the ack.
	INV_X1 out_ack_invs[Nx];
	ortree<Nx>  _ortree(.supply = supply);

	pint index;
	(x:0..Nx-1:
		d_dr_xX[x].supply = supply;
		d_dr_xX[x].in = d_dr_x.out[x];

		// One pulldown per synapse in this column, gated by the synapse ack
		// and a buffered copy of the column's x request.
		(y:0..Ny-1:
			index = x + Nx*y;
			ack_pulldowns[index].vdd = supply.vdd;
			ack_pulldowns[index].vss = supply.vss;
			ack_pulldowns[index].a = out[index].a;
			ack_pulldowns[index].b = d_dr_xX[x].out[y];
			ack_pulldowns[index].y = _out_acksB[x];
		)

		// Request/hs_enB-controlled pullup uses the extra sigbuf copy
		pu[x].vdd = supply.vdd;
		pu[x].vss = supply.vss;
		pu[x].a = d_dr_xX[x].out[Ny];
		pu[x].b = hs_enB;
		pu[x].y = _out_acksB[x];

		// Reset-controlled pullup
		pu_reset[x].vdd = supply.vdd;
		pu_reset[x].vss = supply.vss;
		pu_reset[x].a = reset_B;
		pu_reset[x].y = _out_acksB[x];

		// Keeper on the shared ackB line
		keeps[x].vdd = supply.vdd;
		keeps[x].vss = supply.vss;
		keeps[x].y = _out_acksB[x];

		// Invert ackB and feed it into the OR tree
		out_ack_invs[x].vdd = supply.vdd;
		out_ack_invs[x].vss = supply.vss;
		out_ack_invs[x].a = _out_acksB[x];
		_ortree.in[x] = out_ack_invs[x].y;
	)

	// C element to ensure that the buffer receives an invalid
	// _only_ once _both_ ackB has been reset, _and_ its output data
	// has been fully invalidated.
	// Otherwise run into the issue that ack is removed before data is invalid.
	A_2C_B_X1 buf_ack_Cel(.c1 = _ortree.out, .c2 = valid_Cel.y,
		.vdd = supply.vdd, .vss = supply.vss);

	// Mux to switch between acks from handshake or delay:
	// input a is the plain validity signal, input b is the handshake ack.
	// NOTE(review): presumably s = 0 selects a, matching "hs_en = 0 -> delayed
	// version" above -- confirm against the MUX2_X1 cell definition.
	MUX2_X1 ack_mux(.s = hs_en, .a = valid_Cel.y, .b = buf_ack_Cel.y,
		.vdd = supply.vdd, .vss = supply.vss);

	// Programmable delay between the selected ack and the buffer
	delayprog<N_dly_cfg> dly(.in = ack_mux.y, .out = addr_buf.out.a, .s = dly_cfg, .supply = supply);

}


/*
 * Build an arbiter_handshake tree.
 * Combines the N channels in[] pairwise, layer by layer, with two-input
 * arbiter_handshake cells until a single channel remains, which is
 * connected to out. When a layer has an odd number of channels, the last
 * one is passed through to the next layer unchanged.
 * For N = 1 no arbiter is instantiated and out is connected to in[0].
 */
		export template<pint N>
		defproc arbtree (a1of1 in[N]; a1of1 out; power supply)
		{
			{ N > 0 : "arbtree: N must be at least 1" };

			pint i, end, j;
			i = 0;
			end = N-1;

			pint arbCount;
			arbCount = 0;
			/*
			 * Dry run of the construction loop further down: counts the
			 * arbiters (arbCount) and tree nodes (end) needed, so that the
			 * arrays can be declared with the right size.
			 */
			*[ i != end ->
				j = 0;
				*[ i <= end ->
					j = j + 1;
					[i = end ->
						/*-- odd one out: passed through, no arbiter needed --*/
						i = end+1;
						[] i+1 = end ->
						/*-- last pair of this layer --*/
						i = end+1;
						arbCount = arbCount +1;
						[] else ->
						/*-- a pair, with more signals to follow in this layer --*/
						i = i + 2;
						arbCount = arbCount +1;
						]
					]
				/*-- i is now one past the old end; j nodes were added for the next layer --*/
				end = end+j;
				]

			/* array that holds ALL the nodes in the arbiter tree */
			a1of1 tmp[end+1];

			// Connecting the first nodes to the input
			(l:N:
				tmp[l] = in[l];
				)

			/* array to hold the actual two-input arbiters */
			[arbCount > 0 ->
				arbiter_handshake arbs[arbCount];
				]
			(h:arbCount:arbs[h].supply = supply;)

			/* Reset the loop variables used by the dry run above */
			i = 0;
			end = N-1;
			j = 0;
			pint arbIndex = 0;

			/* Invariant: i <= end */

			*[ i != end ->
				/*
				 * Invariant: tmp[i..end] has the current signals that need to be
				 * combined together
				 */
				j = 0;
				*[ i <= end ->
					/*-- there are still signals that need to be combined --*/
					j = j + 1;
					[ i = end ->
						/*-- last piece: pipe input through to next layer --*/
						tmp[end+j] = tmp[i];
						i = end+1;
						[] i+1 = end ->
						/*-- last piece: combine the final pair with an arbiter --*/
						arbs[arbIndex].in1 = tmp[i];
						arbs[arbIndex].in2 = tmp[i+1];
						arbs[arbIndex].out = tmp[end+j];
						arbIndex = arbIndex +1;
						i = end+1;
						[] else ->
						/*-- more to come; combine this pair with an arbiter --*/
						arbs[arbIndex].in1 = tmp[i];
						arbs[arbIndex].in2 = tmp[i+1];
						arbs[arbIndex].out = tmp[end+j];
						arbIndex = arbIndex +1;
						i = i + 2;
						]
					]
				/*-- update range that has to be combined --*/
				i = end+1;
				end = end+j;
				j = 0;
				]

			/* the single remaining node is the root of the tree */
			out = tmp[end];

		}

		// Generates the OR-trees required to go from 
		// N one-hot inputs to Nc dual rail binary encoding.
		// For output bit c, in[v] is an input of the "true" OR tree when bit c
		// of v is 1 and of the "false" OR tree when that bit is 0.
		export template<pint Nc, N>
		defproc dualrail_encoder(bool? in[N]; Mx1of2<Nc> out; power supply) {
			{N <= 1<<Nc : "Num inputs too wide for encoding channel!"};

			// Every OR tree is sized as if N = 2**Nc (half of the codes each).
			// Codes >= N have no input line, so their tree inputs are tied low.
			pint _N; // N rounded up to a power of 2
			_N = (1<<Nc);
			ortree<_N/2> ors_t[Nc];
			ortree<_N/2> ors_f[Nc];
			TIELO_X1 tielo(.vdd = supply.vdd, .vss = supply.vss);

			pint num_connected_t; // Next free input of the current "true" OR tree
			pint num_connected_f; // Next free input of the current "false" OR tree
			pint bitval;
			(c:0..Nc-1: // For each output line
				ors_t[c].supply = supply;
				ors_f[c].supply = supply;
				ors_t[c].out = out.d[c].t;
				ors_f[c].out = out.d[c].f;

				num_connected_t = 0;
				num_connected_f = 0;
				(v:0.. _N-1:
					bitval = (v & ( 1 << c )) >> c; // Get binary digit of integer v, column c
					[ bitval = 1 ->
						[ v <= N-1 ->
							ors_t[c].in[num_connected_t] = in[v];
						[] v > N-1 ->
							ors_t[c].in[num_connected_t] = tielo.y;
						]
						num_connected_t = num_connected_t + 1;
					[] bitval = 0 ->
						[ v <= N-1 ->
							ors_f[c].in[num_connected_f] = in[v];
						[] v > N-1 ->
							ors_f[c].in[num_connected_f] = tielo.y;
						]
						num_connected_f = num_connected_f + 1;
					]
					)
				)

		}


		/**
		 * Buffer function code.
		 * Is the function block ripped from the buffer_s.
		 * Used in the encoder2d.
		 * For every dualrail bit, one A_2C2N_RB_X4 cell per rail drives the
		 * output rail from the enable, the inverted output ack, the input rail
		 * and the input validity. All shared control signals are fanned out
		 * through sigbufs so that each cell gets its own copy.
		 */
		export template<pint N>
		defproc buffer_s_func (Mx1of2<N> in; avMx1of2<N> out; bool? in_v, en, reset_B; power supply) {
			// Reset: one buffer, then one copy per bit (shared by both rails)
			bool _reset_BX,_reset_BXX[N];
			BUF_X1 reset_buf(.a=reset_B, .y=_reset_BX,.vdd=supply.vdd,.vss=supply.vss);
			sigbuf<N> reset_bufarray(.in=_reset_BX, .out=_reset_BXX, .supply=supply);

			// Enable: separate fan-outs for the true and false rails
			bool _en_X_t[N],_en_X_f[N];
			sigbuf<N> en_buf_t(.in=en, .out=_en_X_t, .supply=supply);
			sigbuf<N> en_buf_f(.in=en, .out=_en_X_f, .supply=supply);

			// Output ack: inverted once, then fanned out per rail.
			// (The _f/_t suffixes of the two sigbuf instances are crossed with
			// respect to their output arrays; both carry the same signal.)
			bool _out_a_B,_out_a_BX_t[N],_out_a_BX_f[N];
			INV_X1 out_a_inv(.a=out.a,.y=_out_a_B, .vss = supply.vss, .vdd = supply.vdd);
			sigbuf<N> out_a_B_buf_f(.in=_out_a_B,.out=_out_a_BX_t, .supply=supply);
			sigbuf<N> out_a_B_buf_t(.in=_out_a_B,.out=_out_a_BX_f, .supply=supply);

			// Input validity: pre-buffered, then fanned out per rail
			bool _in_vX, _in_vXX_t[N],_in_vXX_f[N];
			BUF_X4 in_v_prebuf(.a = in_v, .y = _in_vX, .vss = supply.vss, .vdd = supply.vdd);
			sigbuf<N> in_v_buf_t(.in=_in_vX, .out=_in_vXX_t, .supply=supply);
			sigbuf<N> in_v_buf_f(.in=_in_vX, .out=_in_vXX_f, .supply=supply);

			// Function cells, one per rail and bit
			A_2C2N_RB_X4 t_buf_func[N];
			A_2C2N_RB_X4 f_buf_func[N];
			(k:0..N-1:
				// true rail
				t_buf_func[k].vdd=supply.vdd;
				t_buf_func[k].vss=supply.vss;
				t_buf_func[k].pr_B = _reset_BXX[k];
				t_buf_func[k].sr_B = _reset_BXX[k];
				t_buf_func[k].c1=_en_X_t[k];
				t_buf_func[k].c2=_out_a_BX_t[k];
				t_buf_func[k].n1=in.d[k].t;
				t_buf_func[k].n2=_in_vXX_t[k];
				t_buf_func[k].y=out.d.d[k].t;

				// false rail
				f_buf_func[k].vdd=supply.vdd;
				f_buf_func[k].vss=supply.vss;
				f_buf_func[k].pr_B = _reset_BXX[k];
				f_buf_func[k].sr_B = _reset_BXX[k];
				f_buf_func[k].c1=_en_X_f[k];
				f_buf_func[k].c2=_out_a_BX_f[k];
				f_buf_func[k].n1=in.d[k].f;
				f_buf_func[k].n2=_in_vXX_f[k];
				f_buf_func[k].y=out.d.d[k].f;
			)

		}


		/**
		 * 2D encoder.
		 * Takes Nx request channels inx[] and Ny request channels iny[],
		 * arbitrates each dimension with its own arbtree, encodes the
		 * acknowledged x and y lines into dualrail with dualrail_encoders and
		 * emits the concatenated address on out (x in bits 0..NxC-1, y in bits
		 * NxC..NxC+NyC-1) through a buffer_s_func.
		 *  NxC / NyC are the numbers of dualrail bits of the x / y address.
		 *  ACK_STRENGTH is the parameter of the sigbuf_1output cells that
		 *  drive the acks back onto inx[].a / iny[].a.
		 */
		export template<pint NxC, NyC, Nx, Ny, ACK_STRENGTH>
		defproc encoder2d(a1of1 inx[Nx]; a1of1 iny[Ny]; avMx1of2<(NxC + NyC)> out; power supply; bool reset_B)		{
			// Reset buffers
			pint H = 2*(NxC + NyC); //Reset strength? to be investigated
			bool _reset_BX,_reset_BXX[H];
			BUF_X4 reset_buf(.a=reset_B, .y=_reset_BX,.vdd=supply.vdd,.vss=supply.vss);
			sigbuf<2*(NxC + NyC)> reset_bufarray(.in=_reset_BX, .out=_reset_BXX,.supply=supply);

			// Arbiters 
			a1of1 _arb_out_x, _arb_out_y;
			a1of1 _x_temp[Nx],_y_temp[Ny]; // For wiring the reqs to the arbtrees
			(i:Nx:
				_x_temp[i].r = inx[i].r;
			)
			(i:Ny:
				_y_temp[i].r = iny[i].r;
			)
			arbtree<Nx> Xarb(.in = _x_temp,.out = _arb_out_x,.supply = supply);
			arbtree<Ny> Yarb(.in = _y_temp,.out = _arb_out_y,.supply = supply);

			// Sigbufs for strong acknowledge signals from arb_in's:
			// the arbtree acks are buffered before they reach inx[].a / iny[].a
			sigbuf_1output<ACK_STRENGTH> x_ack_arb[Nx];
			sigbuf_1output<ACK_STRENGTH> y_ack_arb[Ny];
			(i:Nx:
				x_ack_arb[i].in = _x_temp[i].a;
				x_ack_arb[i].out = inx[i].a;
				x_ack_arb[i].supply = supply;
			)
			(i:Ny:
				y_ack_arb[i].in = _y_temp[i].a;
				y_ack_arb[i].out = iny[i].a;
				y_ack_arb[i].supply = supply;
			)

			// This block checks that the input is valid and that the arbiter made a choice
			// Then activates the ack of the y arbiter
			// NOTE(review): this cell is reset through "reset_B" while the other
			// *_RB cells in this file use pr_B/sr_B -- confirm the port name.
			bool _x_v,_in_x_v,_in_y_v,_x_a_B,_x_a;
			A_2C2P_RB_X1 Y_ack_confirm();
			Y_ack_confirm.p1 = _x_v;
			Y_ack_confirm.p2 =_in_x_v;
			Y_ack_confirm.c1 = _arb_out_y.r;
			Y_ack_confirm.c2 = _x_a_B;
			Y_ack_confirm.y = _arb_out_y.a;
			Y_ack_confirm.vdd = supply.vdd;
			Y_ack_confirm.vss = supply.vss;
			Y_ack_confirm.reset_B = _reset_BX;

			// Same for the x arbiter: its ack is produced from the arbiter's
			// request and _x_a_B
			A_2C_RB_X1 X_ack_confirm();
			X_ack_confirm.c1 = _arb_out_x.r;
			X_ack_confirm.c2 = _x_a_B;
			X_ack_confirm.vdd = supply.vdd;
			X_ack_confirm.vss = supply.vss;
			X_ack_confirm.pr_B = _reset_BX;
			X_ack_confirm.sr_B = _reset_BX;
			X_ack_confirm.y = _arb_out_x.a;


			// X_req ORtree: _x_v is the OR of all x requests, _x_v_B its inverse
			bool _x_req_array[Nx], _x_v_B;
			(i:Nx:_x_req_array[i] = inx[i].r;)
			ortree<Nx> x_req_ortree(.in = _x_req_array,.out = _x_v,.supply = supply); //todo BUFF
			// The inverter needs its power pins connected like every other cell
			INV_X1 not_x_req_ortree(.a = _x_v,.y = _x_v_B, .vdd = supply.vdd, .vss = supply.vss);


			bool _en;
			A_1C3P2P2N_R_X1 x_ack(); // NEEDS BUFFERING TO X4
			//branch1
			x_ack.p4 = _in_x_v;
			x_ack.p5 = _x_v_B;
			//branch2
			x_ack.p1 = _in_x_v;
			x_ack.p2 = _in_y_v;
			x_ack.p3 = _x_v;
			//
			x_ack.c1 = _en;
			x_ack.n1 = out.v;
			x_ack.n2 = _in_x_v;
			//
			x_ack.y = _x_a_B;
			//
			x_ack.vdd = supply.vdd;
			x_ack.vss = supply.vss;
			x_ack.pr_B = _reset_BX;
			x_ack.sr_B = _reset_BX;

			INV_X1 not_x_ack(.a = _x_a_B, .y = _x_a, .vdd = supply.vdd, .vss = supply.vss);


			// Enable for x_ack and for the output buffer function
			A_1C2P_X1 enabling(.p1 = out.a, .p2 = out.v, .c1 = _x_a, .y = _en, .vdd = supply.vdd, .vss = supply.vss);

			// NOTE(review): _in_x is not connected to anything in this process.
			avMx1of2<(NxC + NyC)> _in_x;

			// Encoders: one-hot (buffered) acks -> dualrail address
			bool x_acks[Nx];
			Mx1of2<NxC> x_enc_out;
			(i:Nx:x_acks[i] = inx[i].a;)
			dualrail_encoder<NxC, Nx> x_encoder(.in = x_acks, .out = x_enc_out, .supply = supply);

			bool y_acks[Ny];
			Mx1of2<NyC> y_enc_out;
			(i:Ny:y_acks[i] = iny[i].a;)
			dualrail_encoder<NyC, Ny> y_encoder(.in = y_acks, .out = y_enc_out, .supply = supply);

			// Valid trees on the encoded x and y addresses
			vtree<NxC> vtree_x(.in = x_enc_out, .out = _in_x_v, .supply = supply);
			vtree<NyC> vtree_y(.in = y_enc_out, .out = _in_y_v, .supply = supply);

			// Buffer func thing: concat(x,y) goes into the output function block,
			// with the AND of both validities as its input-valid
			Mx1of2<NxC + NyC> into_buffer;
			(i:0..NxC-1:into_buffer.d[i] = x_enc_out.d[i];)
			(i:0..NyC-1:into_buffer.d[i+NxC] = y_enc_out.d[i];)
			AND2_X1 _in_xy_v(.a = _in_x_v, .b = _in_y_v, .vss = supply.vss, .vdd = supply.vdd);
			buffer_s_func<NxC + NyC> buf_s_func(.in = into_buffer, .out = out,
				.en = _en, .in_v = _in_xy_v.y, .supply = supply, .reset_B = reset_B);

		}


		/**
		 * Neuron handshaking.
		 * Looks for a rising edge on the neuron req.
		 * Then performs a 2d handshake out outy then outx.
		 * The output reqs are only ever pulled UP here (A_1P_U_X4 cells);
		 * no pull-down for outx.r / outy.r is instantiated in this process.
		 */
		export
		defproc nrn_hs_2d(a1of1 in; a1of1 outx; a1of1 outy; power supply; bool reset_B)	{
			// Local buffered copy of the reset
			bool _reset_BX;
			BUF_X2 reset_buf(.a = reset_B, .y = _reset_BX, .vdd = supply.vdd, .vss = supply.vss);

			bool _en, _req;
			bool _y_a_B, _x_a_B;

			// Inverted output acks
			INV_X2 inv_x;
			inv_x.a = outx.a;
			inv_x.y = _x_a_B;
			inv_x.vdd = supply.vdd;
			inv_x.vss = supply.vss;

			INV_X2 inv_y;
			inv_y.a = outy.a;
			inv_y.y = _y_a_B;
			inv_y.vdd = supply.vdd;
			inv_y.vss = supply.vss;

			// Input ack.
			// in.r sits on a C input (c2) rather than on an n input: with the
			// A_1C2N_RB_X1 variant (c1 = _en, n1 = _req, n2 = in.r), a req that
			// was not removed in time was recounted as a double spike,
			// since in.req is still high after the out has been dealt with.
			A_2C1N_RB_X1 A_ack;
			A_ack.c1 = _en;
			A_ack.c2 = in.r;
			A_ack.n1 = _req;
			A_ack.y = in.a;
			A_ack.pr_B = _reset_BX;
			A_ack.sr_B = _reset_BX;
			A_ack.vdd = supply.vdd;
			A_ack.vss = supply.vss;

			// Enable
			A_1C1P_X1 A_en;
			A_en.p1 = _req;
			A_en.c1 = in.a;
			A_en.y = _en;
			A_en.vdd = supply.vdd;
			A_en.vss = supply.vss;

			// Internal request that drives both output pull-ups
			A_2C1P1N_RB_X1 A_req;
			A_req.p1 = _x_a_B;
			A_req.c1 = _en;
			A_req.c2 = _y_a_B;
			A_req.n1 = in.r;
			A_req.y = _req;
			A_req.pr_B = _reset_BX;
			A_req.sr_B = _reset_BX;
			A_req.vdd = supply.vdd;
			A_req.vss = supply.vss;

			// y_req pull up: NAND of _req and the inverted y ack
			NAND2_X1 nand_y(.a = _y_a_B, .b = _req, .vdd = supply.vdd, .vss = supply.vss);
			A_1P_U_X4 pu_y(.a = nand_y.y, .y = outy.r, .vdd = supply.vdd, .vss = supply.vss);

			// x_req pull up: additionally takes the y ack (outy.a) as an input
			NAND3_X1 nand_x(.a = _x_a_B, .b = _req, .c = outy.a, .vdd = supply.vdd, .vss = supply.vss);
			A_1P_U_X4 pu_x(.a = nand_x.y, .y = outx.r, .vdd = supply.vdd, .vss = supply.vss);
		}


		/**
		 * Line-end pull down for a shared request line.
		 * "in" passes through two buffers and an inverter and is then NANDed
		 * with reset_B; the NAND output controls an A_1N_U_X4 pull-down cell
		 * whose output is "out". In nrn_hs_2d_array "in" is the (delayed) line
		 * ack and "out" the line req.
		 */
		export
		defproc nrn_line_end_pull_down (bool? in; bool? reset_B; power supply; bool! out)
		{
			bool _out, __out, nand_out;
			// Two buffers followed by an inverter on the input
			BUF_X1 buf1(.a=in, .y=_out, .vdd=supply.vdd,.vss=supply.vss);
			BUF_X1 buf2(.a=_out, .y=__out, .vdd=supply.vdd,.vss=supply.vss);
			INV_X1 inv(.a = __out, .vdd=supply.vdd,.vss =supply.vss);

			// nand_out = NAND(inverted in, reset_B)
			NAND2_X1 aenor(.a=inv.y, .b=reset_B, .y = nand_out, .vdd=supply.vdd,.vss=supply.vss);

			// Pull-down cell; power pins connected like every other cell in this file
			A_1N_U_X4 pull_down(.a=nand_out, .y=out, .vdd=supply.vdd, .vss=supply.vss);
		}


		/**
		 * A 2d grid of neuron handshakers.
		 * Should then slot into the encoder.
		 * Each neuron has an a1of1 channel (in), which is tripped when a neuron spikes.
		 * The neuron at (x, y) uses in[x + y*Nx] and shares the internal
		 * line channels _outx[x] and _outy[y] with its column / row.
		 * N_dly is number of delay elements to add to line pull down, 
		 * for the purpose of running ACT sims.
		 * It should probably be set to 0 though.
		 */
		export template<pint Nx, Ny, N_dly>
		defproc nrn_hs_2d_array(a1of1 in[Nx*Ny]; a1of1 outx[Nx], outy[Ny];
			power supply; bool reset_B) {

			// Internal (unbuffered) line channels shared by the neurons
			a1of1 _outx[Nx], _outy[Ny];

			// Reset fan-out for the neuron grid: one copy per row (rsbx),
			// then one copy per neuron in that row (rsb[y])
			sigbuf<Ny> rsbx(.in = reset_B, .supply = supply);
			sigbuf<Nx> rsb[Ny]; // ResetSigBuf

			// Separate reset fan-outs for the line-end pull downs
			sigbuf<Nx> rsb_pd_x(.in = reset_B, .supply = supply);
			sigbuf<Ny> rsb_pd_y(.in = reset_B, .supply = supply);

			// Per-line cells.
			// The ack buffers are plain BUF_X4s; this should be generalised
			// (TODO: do it properly with sigbufs?), and probably won't even
			// be done by ACT/innovus anyway.
			// The delay chains emulate the fact that the line pull downs
			// are at the end of the line, and thus slow.
			// Note that if N_dly = 0, delay fifo is just a pipe.
			BUF_X4 out_req_buf_x[Nx];
			BUF_X4 out_ack_buf_x[Nx];
			delay_chain<N_dly> dly_x[Nx];
			nrn_line_end_pull_down pd_x[Nx];
			KEEP_X1 keep_x[Nx];

			BUF_X4 out_req_buf_y[Ny];
			BUF_X4 out_ack_buf_y[Ny];
			delay_chain<N_dly> dly_y[Ny];
			nrn_line_end_pull_down pd_y[Ny];
			KEEP_X1 keep_y[Ny];

			// X lines
			(x:0..Nx-1:
				// internal req -> external req
				out_req_buf_x[x].vdd = supply.vdd;
				out_req_buf_x[x].vss = supply.vss;
				out_req_buf_x[x].a = _outx[x].r;
				out_req_buf_x[x].y = outx[x].r;

				// external ack -> internal ack
				out_ack_buf_x[x].vdd = supply.vdd;
				out_ack_buf_x[x].vss = supply.vss;
				out_ack_buf_x[x].a = outx[x].a;
				out_ack_buf_x[x].y = _outx[x].a;

				// delayed internal ack feeds the pull down on the internal req
				dly_x[x].supply = supply;
				dly_x[x].in = _outx[x].a;
				pd_x[x].supply = supply;
				pd_x[x].reset_B = rsb_pd_x.out[x];
				pd_x[x].in = dly_x[x].out;
				pd_x[x].out = _outx[x].r;

				// keeper on the internal req line
				keep_x[x].vdd = supply.vdd;
				keep_x[x].vss = supply.vss;
				keep_x[x].y = _outx[x].r;
			)

			// Y lines (same structure), plus the per-row reset fan-out
			(y:0..Ny-1:
				rsb[y].supply = supply;
				rsb[y].in = rsbx.out[y];

				out_req_buf_y[y].vdd = supply.vdd;
				out_req_buf_y[y].vss = supply.vss;
				out_req_buf_y[y].a = _outy[y].r;
				out_req_buf_y[y].y = outy[y].r;

				out_ack_buf_y[y].vdd = supply.vdd;
				out_ack_buf_y[y].vss = supply.vss;
				out_ack_buf_y[y].a = outy[y].a;
				out_ack_buf_y[y].y = _outy[y].a;

				dly_y[y].supply = supply;
				dly_y[y].in = _outy[y].a;
				pd_y[y].supply = supply;
				pd_y[y].reset_B = rsb_pd_y.out[y];
				pd_y[y].in = dly_y[y].out;
				pd_y[y].out = _outy[y].r;

				keep_y[y].vdd = supply.vdd;
				keep_y[y].vss = supply.vss;
				keep_y[y].y = _outy[y].r;
			)

			// Create handshake grid 
			pint index;
			nrn_hs_2d neurons[Nx*Ny];
			(y:0..Ny-1:
				(x:0..Nx-1:
					index = x + y*Nx;
					neurons[index].supply = supply;
					neurons[index].reset_B = rsb[y].out[x];
					neurons[index].in = in[index];
					neurons[index].outx = _outx[x];
					neurons[index].outy = _outy[y];
				)
			)
		}


	}

}