/************************************************************************* * * This file is part of ACT dataflow neuro library * * Copyright (c) 2022 University of Groningen - Ole Richter * Copyright (c) 2022 University of Groningen - Michele Mastella * Copyright (c) 2022 University of Groningen - Hugh Greatorex * Copyright (c) 2022 University of Groningen - Madison Cotteret * * * This source describes Open Hardware and is licensed under the CERN-OHL-W v2 or later * * You may redistribute and modify this documentation and make products * using it under the terms of the CERN-OHL-W v2 (https:/cern.ch/cern-ohl). * This documentation is distributed WITHOUT ANY EXPRESS OR IMPLIED * WARRANTY, INCLUDING OF MERCHANTABILITY, SATISFACTORY QUALITY * AND FITNESS FOR A PARTICULAR PURPOSE. Please see the CERN-OHL-W v2 * for applicable conditions. * * Source location: https://git.web.rug.nl/bics/actlib_dataflow_neuro * * As per CERN-OHL-W v2 section 4.1, should You produce hardware based on * these sources, You must maintain the Source Location visible in its * documentation. 
* ************************************************************************** */

import "../../dataflow_neuro/cell_lib_async.act";
import "../../dataflow_neuro/cell_lib_std.act";
import "../../dataflow_neuro/treegates.act";
import "../../dataflow_neuro/primitives.act";
import "../../dataflow_neuro/coders.act";
// import tmpl::dataflow_neuro;
// import tmpl::dataflow_neuro;
import std::channel;
open std::channel;

namespace tmpl {
namespace dataflow_neuro {

// NOTE(review): throughout this file the template parameter lists and the
// instance template arguments that start with a letter (e.g. on d1of, vtree,
// sigbuf, andtree, ortree) are not present in the text, while arguments that
// start with a digit (e.g. sigbuf_1output<4>) are. This looks like
// angle-bracket content was stripped by a text-processing step; restore the
// missing pieces from the repository before trying to build this file.

// Circuit for storing registers using AER.
//
// Parameters (as described by the original author; the template list itself
// is not visible in this text):
//   lognw     -> log2(number of words), parameters you can store
//   wl        -> word length, length of each word
//   N_dly_cfg -> the number of config bits in the ACK delay line
//
// Pins:
//   in   -> input data,
//           - the first bit is write/read_B
//           - the next lognw bits describe the location,
//           - the last wl the word to write
//   data -> the data saved in the flip flop, sized wl x nw
//
// What the visible body does:
//   * Bits [0, wl) of `in` feed the flip-flop `d` inputs; bits [wl, wl+lognw)
//     are decoded by one andtree per word to select the word.
//   * The validity signal `_in_v_temp` is delayed by `clk_dly` to form the
//     write pulse and delayed again by `ack_dly` to form `in.a`.
//   * Bit lognw+wl is never referenced on its own in the visible body, so the
//     write pulse is generated for every valid input.
//
// NOTE(review): `nw`, `dly_cfg`, `supply`, `reset_B`, `reset_mem_B` and the
// bools `_in_v_temp`, `_in_a_temp`, `_clock_temp`, `_clock_temp_inv`, `_clock`
// are used below without a visible declaration; they presumably live in the
// part of the next line that was lost. The line is kept exactly as found.
export template defproc register_w (avMx1of2<1+lognw+wl> in; d1of data[1< val_input(.in = in.d,.out = _in_v_temp, .supply = supply);
    // Buffer the validity signal and drive it back on in.v.
    sigbuf_1output<4> val_input_X(.in = _in_v_temp,.out = in.v,.supply = supply);

    // Generation of the fake clock pulse (inverted because the ff clocks are low_active)
    delayprog clk_dly(.in = _in_v_temp, .out = _clock_temp,.s = dly_cfg, .supply = supply);
    INV_X1 inv_clk(.a = _clock_temp,.y = _clock_temp_inv,.vdd = supply.vdd,.vss = supply.vss);
    // NOTE(review): here the word clock is buffered from the NON-inverted
    // `_clock_temp` and the acknowledge delay line is fed from
    // `_clock_temp_inv`. register_rw in this file wires the two the other way
    // around (clock from `_clock_temp_inv`, ack from `_clock_temp`). Confirm
    // which polarity is intended for this write-only variant.
    sigbuf_1output<4> clk_X(.in = _clock_temp,.out = _clock,.supply = supply);

    // Sending back to the acknowledge
    delayprog ack_dly(.in = _clock_temp_inv, .out = _in_a_temp,.s = dly_cfg, .supply = supply);
    sigbuf_1output<4> ack_input_X(.in = _in_a_temp,.out = in.a,.supply = supply);

    // Reset buffers: reset_mem_B is buffered once and then fanned out to one
    // net per flip-flop (_reset_mem_BXX).
    // NOTE(review): `_reset_BX` is driven below but not read anywhere else in
    // this process.
    bool _reset_BX,_reset_mem_BX,_reset_mem_BXX[nw*wl];
    BUF_X1 reset_buf_BX(.a=reset_B, .y=_reset_BX,.vdd=supply.vdd,.vss=supply.vss);
    BUF_X1 reset_buf_BXX(.a=reset_mem_B, .y=_reset_mem_BX,.vdd=supply.vdd,.vss=supply.vss);
    sigbuf reset_bufarray(.in=_reset_mem_BX, .out=_reset_mem_BXX,.supply=supply);

    // Creating the different flip flop arrays
    // NOTE(review): `_clock_word` and `_clock_buffer_out` are declared but not
    // used in the visible body.
    bool _out_encoder[nw],_clock_word_temp[nw],_clock_word[nw],_clock_buffer_out[nw*wl];
    andtree atree[nw];
    AND2_X1 and_encoder[nw];
    sigbuf clock_buffer[nw];
    DFFQ_R_X1 ff[nw*wl];
    pint bitval;
    (k:nw:atree[k].supply = supply;)
    (word_idx:nw:
        // Decoding the bit pattern to understand which word we are looking at:
        // for address bit pin_idx, the andtree of word word_idx listens to the
        // true rail when that bit of word_idx is 1 and to the false rail when
        // it is 0.
        (pin_idx:lognw:
            bitval = (word_idx & ( 1 << pin_idx )) >> pin_idx; // Get binary digit pin_idx of integer word_idx
            // NOTE(review): the message of the impossible `bitval >= 2` branch
            // should be replaced by a descriptive, professional one (e.g.
            // "bitval must be 0 or 1"). Left untouched here because it is a
            // string literal.
            [bitval = 1 -> atree[word_idx].in[pin_idx] = in.d.d[pin_idx+wl].t;
            [] bitval = 0 -> atree[word_idx].in[pin_idx] = in.d.d[pin_idx+wl].f;
            []bitval >= 2 -> {false : "fuck"};
            ]
        )
        // Activating the fake clock for the right word: AND of the decoded
        // word select with the shared clock, then fanned out by clock_buffer.
        atree[word_idx].out = _out_encoder[word_idx];
        and_encoder[word_idx].a = _out_encoder[word_idx];
        and_encoder[word_idx].b = _clock;
        and_encoder[word_idx].y = _clock_word_temp[word_idx];
        and_encoder[word_idx].vdd = supply.vdd;
        and_encoder[word_idx].vss = supply.vss;
        clock_buffer[word_idx].in = _clock_word_temp[word_idx];
        clock_buffer[word_idx].supply = supply;
        // Describing all the FF and their connection.
        // Flip-flop (word_idx, bit_idx) lives at flat index bit_idx + word_idx*wl.
        (bit_idx:wl:
            ff[bit_idx+word_idx*(wl)].clk_B = clock_buffer[word_idx].out[bit_idx];
            ff[bit_idx+word_idx*(wl)].d = in.d.d[bit_idx].t;
            ff[bit_idx+word_idx*(wl)].q = data[word_idx].d[bit_idx];
            ff[bit_idx+word_idx*(wl)].reset_B = _reset_mem_BXX[bit_idx+word_idx*(wl)];
            ff[bit_idx+word_idx*(wl)].vdd = supply.vdd;
            ff[bit_idx+word_idx*(wl)].vss = supply.vss;
        )
    )
}

// Circuit for storing and reading registers using AER.
//
// Parameters (as described by the original author; the template list itself
// is not visible in this text):
//   lognw     -> log2(number of words), parameters you can store
//   wl        -> word length, length of each word
//   N_dly_cfg -> the number of config bits in the ACK delay line
//
// Pins:
//   in   -> input data,
//           - the MSB is write/read_B
//           - the next MSB bits (size lognw) are the location,
//           - the LSB (size wl) are the word to write
//   out  -> in case a reading phase is required, the output is used to show the stored data
//           - the MSB bits (size lognw) tell the read register
//           - the LSB bits (size wl) tell the word read
//   data -> the data saved in the flip flop, sized wl x nw
//
// NOTE(review): in the body below the write path is enabled by the FALSE rail
// of bit lognw+wl (`in.d.d[lognw+wl].f`) and the read path by its TRUE rail.
// That reads as "MSB = 1 means read", which does not obviously match the
// "write/read_B" wording above. Confirm the intended encoding.
//
// NOTE(review): as in register_w, `nw`, `dly_cfg`, `supply`, `reset_B`,
// `reset_mem_B` and several bools (`_in_v_temp`, `_in_a_temp`, `_in_a_write`,
// `_in_a_read`, `_clock_temp`, `_clock_temp_inv`, `_clock`) are used without a
// visible declaration; part of the next line appears to have been lost. The
// line is kept exactly as found.
export template defproc register_rw (avMx1of2<1+lognw+wl> in; avMx1of2 out; d1of data[1< val_input(.in = in.d,.out = _in_v_temp, .supply = supply);
    // Buffer the validity signal and drive it back on in.v.
    sigbuf_1output<12> val_input_X(.in = _in_v_temp,.out = in.v,.supply = supply);

    // Acknowledgment: in.a is the OR of the write-path and read-path acks.
    OR2_X1 ack_readwrite(.a = _in_a_write,.b = _in_a_read,.y = _in_a_temp,.vdd = supply.vdd,.vss = supply.vss);
    sigbuf_1output<12> ack_input_X(.in = _in_a_temp,.out = in.a,.supply = supply);

    // WRITE
    // Generation of the fake clock pulse if write is HIGH (inverted because the ff clocks are low_active)
    bool _in_v_temp_write;
    AND2_X1 clk_switch(.a = _in_v_temp,.b = in.d.d[lognw+wl].f,.y = _in_v_temp_write,.vdd = supply.vdd,.vss = supply.vss);
    delayprog clk_dly(.in = _in_v_temp_write, .out = _clock_temp,.s = dly_cfg, .supply = supply);
    INV_X1 inv_clk(.a = _clock_temp,.y = _clock_temp_inv,.vdd = supply.vdd,.vss = supply.vss);
    // The inverted clock is fanned out; `_clock` is indexed per word further
    // down (`_clock[word_idx]`).
    sigbuf clk_X(.in = _clock_temp_inv, .out = _clock,.supply = supply);
    sigbuf clock_buffer[nw];
    // NOTE(review): `_clock_word` and `_clock_buffer_out` are declared but not
    // used in the visible body.
    bool _clock_word_temp[nw],_clock_word[nw],_clock_buffer_out[nw*wl];

    // Sending back to the acknowledge: the write ack is the delayed clock,
    // gated by the same rail that enabled the write.
    bool _in_a_write_temp;
    delayprog ack_dly(.in = _clock_temp, .out = _in_a_write_temp,.s = dly_cfg, .supply = supply);
    AND2_X1 ack_write_and(.a = in.d.d[lognw+wl].f,.b = _in_a_write_temp,.y = _in_a_write,.vdd = supply.vdd, .vss = supply.vss);

    // READ
    // Outputting the word to read
    AND2_X1 word_to_read[nw];
    sigbuf word_to_read_X[nw];
    ortree bitselector_t[wl];
    ortree bitselector_f[wl];
    AND2_X1 word_selector_t[nw*wl];
    AND2_X1 word_selector_f[nw*wl];
    buffer_s output_buf(.out = out,.supply = supply, .reset_B = reset_B);
    AND2_X1 address_propagator_f[lognw],address_propagator_t[lognw];
    // Outputting the address if the read is true: both rails of every address
    // bit are gated by the true rail of bit lognw+wl and forwarded to the same
    // bit position of the output buffer input.
    (i:lognw:
        address_propagator_t[i].a = in.d.d[lognw+wl].t;
        address_propagator_t[i].b = in.d.d[i+wl].t;
        address_propagator_t[i].y = output_buf.in.d.d[i+wl].t;
        address_propagator_t[i].vdd = supply.vdd;
        address_propagator_t[i].vss = supply.vss;
        address_propagator_f[i].a = in.d.d[lognw+wl].t;
        address_propagator_f[i].b = in.d.d[i+wl].f;
        address_propagator_f[i].y = output_buf.in.d.d[i+wl].f;
        address_propagator_f[i].vdd = supply.vdd;
        address_propagator_f[i].vss = supply.vss;
    )
    // The read ack is the output buffer's input ack, gated by the read rail.
    AND2_X1 ack_read_and(.a = in.d.d[lognw+wl].t,.b = output_buf.in.a,.y = _in_a_read,.vdd = supply.vdd, .vss = supply.vss);

    // Reset Buffers
    // NOTE(review): `_reset_BXX` is driven by reset_bufarray but not read
    // anywhere else in the visible body.
    bool _reset_BX, _reset_BXX[nw],_reset_mem_BX,_reset_mem_BXX[nw*wl];
    BUF_X1 reset_buf_BX(.a=reset_B, .y=_reset_BX,.vdd=supply.vdd,.vss=supply.vss);
    BUF_X1 reset_buf_BXX(.a=reset_mem_B, .y=_reset_mem_BX,.vdd=supply.vdd,.vss=supply.vss);
    sigbuf reset_mem_bufarray(.in=_reset_mem_BX, .out=_reset_mem_BXX,.supply=supply);
    sigbuf reset_bufarray(.in=_reset_BX, .out=_reset_BXX,.supply=supply);

    // Creating the encoder
    andtree atree[nw];
    OR2_X1 or_encoder[nw];
    INV_X1 inv_encoder[nw];

    // Creating the different flip flop arrays
    bool _out_encoder[nw];
    DFFQ_R_X1 ff[nw*wl];
    AND2_X1 val_chck[nw*wl];
    bool _val_chck_out[nw*wl];
    bool _in_v_temp_buf[nw*wl];
    sigbuf v_buf(.in = _in_v_temp,.out = _in_v_temp_buf,.supply = supply);

    // For loop for assigning the different components
    pint bitval;
    (k:nw:atree[k].supply = supply;)
    (word_idx:nw:
        // Decoding the bit pattern to understand which word we are looking at
        // (same scheme as in register_w: true rail for a 1 bit of word_idx,
        // false rail for a 0 bit).
        (pin_idx:lognw:
            bitval = (word_idx & ( 1 << pin_idx )) >> pin_idx; // Get binary digit pin_idx of integer word_idx
            // NOTE(review): replace the message of the impossible branch with
            // a descriptive, professional one. Left untouched here because it
            // is a string literal.
            [bitval = 1 -> atree[word_idx].in[pin_idx] = in.d.d[pin_idx+wl].t;
            [] bitval = 0 -> atree[word_idx].in[pin_idx] = in.d.d[pin_idx+wl].f;
            []bitval >= 2 -> {false : "fuck"};
            ]
        )
        // WRITE: Activating the fake clock for the right word.
        // The word clock is (NOT word-select) OR (inverted clock), then fanned
        // out by clock_buffer to the clk_B pins of that word.
        atree[word_idx].out = _out_encoder[word_idx];
        inv_encoder[word_idx].a = _out_encoder[word_idx];
        inv_encoder[word_idx].y = or_encoder[word_idx].a;
        inv_encoder[word_idx].vdd = supply.vdd;
        inv_encoder[word_idx].vss = supply.vss;
        or_encoder[word_idx].b = _clock[word_idx];
        or_encoder[word_idx].y = _clock_word_temp[word_idx];
        or_encoder[word_idx].vdd = supply.vdd;
        or_encoder[word_idx].vss = supply.vss;
        clock_buffer[word_idx].in = _clock_word_temp[word_idx];
        clock_buffer[word_idx].supply = supply;
        // READ: Selecting the right word to read if read is high
        word_to_read[word_idx].a = in.d.d[lognw+wl].t;
        word_to_read[word_idx].b = _out_encoder[word_idx];
        word_to_read[word_idx].y = word_to_read_X[word_idx].in;
        word_to_read[word_idx].vdd = supply.vdd;
        word_to_read[word_idx].vss = supply.vss;
        word_to_read_X[word_idx].supply = supply;
        (bit_idx:wl:
            // Describing all the FF and their connection
            // NOTE(review): `val_chck` is declared with nw*wl entries but is
            // indexed with `bit_idx` only, so the same wl instances are
            // re-connected on every word iteration and the remaining entries
            // are left unconnected. The `_in_v_temp_buf` index is
            // `word_idx+bit_idx`, unlike the `bit_idx+word_idx*(wl)` flat
            // index used for `ff`. `_val_chck_out` is not read anywhere in the
            // visible body. This looks unfinished — confirm the intent.
            val_chck[bit_idx].a = _in_v_temp_buf[word_idx+bit_idx];
            val_chck[bit_idx].b = in.d.d[bit_idx].t;
            val_chck[bit_idx].y = _val_chck_out[bit_idx];
            val_chck[bit_idx].vdd = supply.vdd;
            val_chck[bit_idx].vss = supply.vss;
            // Flip-flop (word_idx, bit_idx) lives at flat index bit_idx + word_idx*wl.
            ff[bit_idx+word_idx*(wl)].clk_B = clock_buffer[word_idx].out[bit_idx];
            ff[bit_idx+word_idx*(wl)].d = in.d.d[bit_idx].t;
            ff[bit_idx+word_idx*(wl)].q = data[word_idx].d[bit_idx];
            ff[bit_idx+word_idx*(wl)].reset_B = _reset_mem_BXX[bit_idx+word_idx*(wl)];
            ff[bit_idx+word_idx*(wl)].vdd = supply.vdd;
            ff[bit_idx+word_idx*(wl)].vss = supply.vss;
            // READ: creating the selectors for propagating the right word.
            // Outputs [0, wl) of word_to_read_X gate the q (true) selectors and
            // outputs [wl, 2*wl) gate the q_B (false) selectors; the per-bit
            // ortrees merge the selected word into the output buffer input.
            word_to_read_X[word_idx].out[bit_idx] = word_selector_t[bit_idx+(word_idx*(wl))].a;
            word_to_read_X[word_idx].out[bit_idx+wl] = word_selector_f[bit_idx+(word_idx*(wl))].a;
            word_selector_t[bit_idx+word_idx*(wl)].b = ff[bit_idx+(word_idx*(wl))].q;
            word_selector_t[bit_idx+word_idx*(wl)].y = bitselector_t[bit_idx].in[word_idx];
            word_selector_f[bit_idx+word_idx*(wl)].b = ff[bit_idx+(word_idx*(wl))].q_B;
            word_selector_f[bit_idx+word_idx*(wl)].y = bitselector_f[bit_idx].in[word_idx];
            bitselector_t[bit_idx].out = output_buf.in.d.d[bit_idx].t;
            bitselector_f[bit_idx].out = output_buf.in.d.d[bit_idx].f;
            bitselector_t[bit_idx].supply = supply;
            bitselector_f[bit_idx].supply = supply;
        )
    )
}

/**
 * Buffer for use in an A-cell register.
 * Basically the same as a normal buffer, except that when out.v goes high,
 * in.a goes high too.
 * Also, in.a does not wait for out.v to go low to go to low.
 * Means have a buffer that completes its Right handshake as soon as out data is valid.
 *
 * Ports, as used by the body:
 *   in      - input channel; in.v is driven here from a vtree over in.d,
 *             in.a from the inverted state of the inack_ctl cell.
 *   out     - output data rails, driven by the per-bit t/f function cells.
 *   out_v   - externally computed validity of `out` (input to inack_ctl).
 *   flush   - inverted locally; the inverse feeds c1 of every function cell.
 *   reset_B - buffered and fanned out for the f cells, inverted for the t cells.
 *   supply  - power rails.
 *
 * NOTE(review): the template parameter list (the body uses `N`) and the
 * template arguments of the channel types, vtree and sigbuf are not visible
 * in this text.
 */
export template defproc buffer_register(avMx1of2 in; Mx1of2 out; bool? out_v, flush, reset_B; power supply) {

    // BIG TODO
    // I HAVE NOT BOTHERED WITH ANY SIGNAL BUFFERING IN HERE YET

    //control
    bool _en, _reset_BX,_reset_BXX[N];
    bool _in_aB;
    bool _reset;
    // NOTE(review): unlike the other cell instances in this file, `reset_inv`
    // has no vdd/vss connection.
    INV_X1 reset_inv(.a = reset_B, .y = _reset);
    A_2C1N_R_X1 inack_ctl(.c1=_in_aB,.c2=in.v,.n1=out_v,.y=_in_aB, .pr_B=_reset_BX,.sr_B=_reset_BX,.vdd=supply.vdd,.vss=supply.vss);
    INV_X1 inack_inv(.a = _in_aB, .y = in.a, .vdd = supply.vdd, .vss = supply.vss);
    // A_1C1P_X1 en_ctl(.c1=in.a,.p1=out.v,.y=_en,
    //  .vdd=supply.vdd,.vss=supply.vss);

    bool _flushB;
    // NOTE(review): `flush_inv` has no vdd/vss connection either.
    INV_X1 flush_inv(.a = flush, .y = _flushB);
    // AND2_X1 flush_en(.a = _flushB, .b = _in_aB, .y = _en);
    // With the AND gate above commented out, the enable is simply an alias of
    // the inverted input acknowledge.
    _en = _in_aB;

    BUF_X1 reset_buf(.a=reset_B, .y=_reset_BX,.vdd=supply.vdd,.vss=supply.vss);
    // NOTE(review): every other sigbuf instance in this file is given a
    // `.supply` connection; this one is not.
    sigbuf reset_bufarray(.in=_reset_BX, .out=_reset_BXX);

    //validity
    bool _in_v;
    vtree vc(.in=in.d,.out=_in_v,.supply=supply);
    BUF_X4 in_v_buf(.a=_in_v, .y=in.v,.vdd=supply.vdd,.vss=supply.vss);

    //function
    // NOTE(review): `_out_a_BX_t`, `_out_a_BX_f` and `_out_a_B` are only
    // referenced by the commented-out lines below.
    bool _out_a_BX_t[N],_out_a_BX_f[N],_out_a_B,_en_X_t[N],_en_X_f[N];
    A_1C2N_RB_X4 f_buf_func[N];
    A_1C2N_SB_X4 t_buf_func[N];
    sigbuf en_buf_t(.in=_en, .out=_en_X_t, .supply=supply);
    sigbuf en_buf_f(.in=_en, .out=_en_X_f, .supply=supply);
    // INV_X1 out_a_inv(.a=out.a,.y=_out_a_B, .vss = supply.vss, .vdd = supply.vdd);
    // sigbuf out_a_B_buf_f(.in=_out_a_B,.out=_out_a_BX_t);
    // sigbuf out_a_B_buf_t(.in=_out_a_B,.out=_out_a_BX_f);

    // check if you can also do single var to array connect a=b[N]
    // and remove them from the loop
    (i:N:
        f_buf_func[i].y=out.d[i].f;
        t_buf_func[i].y=out.d[i].t;
        f_buf_func[i].c1=_flushB;
        t_buf_func[i].c1=_flushB;
        f_buf_func[i].n2=_en_X_f[i];
        t_buf_func[i].n2=_en_X_t[i];
        // f_buf_func[i].c2=_out_a_BX_f[i];
        // t_buf_func[i].c2=_out_a_BX_t[i];
        f_buf_func[i].n1=in.d.d[i].f;
        t_buf_func[i].n1=in.d.d[i].t;
        f_buf_func[i].vdd=supply.vdd;
        t_buf_func[i].vdd=supply.vdd;
        f_buf_func[i].vss=supply.vss;
        t_buf_func[i].vss=supply.vss;
        // The t cells take the active-high reset on pr/sr, the f cells the
        // buffered active-low reset on pr_B/sr_B.
        t_buf_func[i].pr = _reset;
        t_buf_func[i].sr = _reset;
        f_buf_func[i].pr_B = _reset_BXX[i];
        f_buf_func[i].sr_B = _reset_BXX[i];
    )
}

/**
 * A single register made out of A cells.
 * last bit is whether to read or write.
 * Currently only handles writing.
 *
 * Structure, as wired in the body:
 *   - bit N of `in` (true rail) is the write flag `_w`;
 *   - bits [0, N) of `in` are gated by `pass` (NOR of `_en2` and `_flush`)
 *     before entering the buffer_register instance `buf`;
 *   - `buf.out` is exported on `out` and its validity `_out_v` is computed by
 *     a vtree and fed back to `buf.out_v` and to the control cells;
 *   - `in.a` is produced by `in_ack_safety` from the inverted `buf.in.a` and
 *     `_en2`.
 *
 * NOTE(review): the template parameter list (the body uses `N`) and the
 * template arguments of the channel types, buffer_register and vtree are not
 * visible in this text.
 */
export template defproc registerA(avMx1of2 in; Mx1of2 out; bool? reset_B; power supply) {

    // BIG TODO
    // I HAVE NOT BOTHERED WITH ANY SIGNAL BUFFERING IN HERE YET

    bool _en2;
    bool _w;
    bool _out_v, _out_vB;
    bool _flush, _flushB;
    _w = in.d.d[N].t;

    // Buffer
    buffer_register buf(.out = out, .out_v = _out_v, .flush = _flush, .supply = supply, .reset_B = reset_B);
    buf.in.v = in.v;

    // In ack stuff
    INV_X1 in_ack_inv(.a = buf.in.a, .vdd = supply.vdd, .vss = supply.vss);
    // To stop in ack going low before en2 has been reset.
    A_1C1N_X1 in_ack_safety(.c1 = in_ack_inv.y, .n1 = _en2, .y = in.a, .vdd = supply.vdd, .vss = supply.vss);

    // Out valid tree
    vtree out_valid(.in = buf.out, .out = _out_v, .supply = supply);
    INV_X2 out_val_inv(.a = _out_v, .y = _out_vB, .vdd = supply.vdd, .vss=supply.vss);

    // Control
    A_1C1P2N_RB_X1 A_flush(.c1 = _en2, .n1 = _out_v, .n2 = _w, .p1 = _flushB, .y = _flush, .vdd = supply.vdd, .vss = supply.vss, .sr_B = reset_B, .pr_B = reset_B);
    INV_X2 flush_inv(.a = _flush, .y = _flushB, .vdd = supply.vdd, .vss = supply.vss);
    // NOTE(review): `A_en2` has no vdd/vss connection, unlike `A_flush` above.
    A_1C2N_R_X1 A_en2(.c1 = _w, .n1 = _en2, .n2 = _out_vB, .y = _en2, .pr_B = reset_B, .sr_B = reset_B);

    // Pass to let data into the buffer: pass.y is the NOR of _en2 and _flush
    // and gates both rails of every data bit.
    NOR2_X1 pass(.a = _en2, .b = _flush, .vss = supply.vss, .vdd = supply.vdd);
    AND2_X1 gandalf_t[N];
    AND2_X1 gandalf_f[N];
    (i:0..N-1:
        gandalf_t[i].a = in.d.d[i].t;
        gandalf_f[i].a = in.d.d[i].f;
        gandalf_t[i].b = pass.y;
        gandalf_f[i].b = pass.y;
        gandalf_t[i].y = buf.in.d.d[i].t;
        gandalf_f[i].y = buf.in.d.d[i].f;
        gandalf_t[i].vdd = supply.vdd;
        gandalf_f[i].vdd = supply.vdd;
        gandalf_t[i].vss = supply.vss;
        gandalf_f[i].vss = supply.vss;
    )
}

/**
 * Array of registers made out of A-cells
 * params:
 * NcW: number of bits in Words to be stored in buffers
 * NcA: number of bits in Address
 * M: number of registers. M = 2^Nc_addr would be a natural choice.
 * Input packets should be
 * [-addr-][-word-][r/w]
 *
 * Bit positions, as indexed by the body: address in bits [0, NcA), word in
 * bits [NcA, NcA+NcW), write flag at bit NcA+NcW.
 *
 * NOTE(review): the template parameter list and the template arguments of
 * the channel types, vtree, decoder_dualrail, ortree and registerA are not
 * visible in this text.
 */
export template defproc registerA_w_array(avMx1of2 in; Mx1of2 data[M]; bool? reset_B; power supply) {

    // BIG TODO
    // I HAVE NOT BOTHERED WITH ANY SIGNAL BUFFERING IN HERE YET

    // Input valid tree
    // Note that I may need to check the validity of other downstream stuff,
    // to be ultra careful about delays.
    // e.g. TODO add validity checking on the selector signals.
    vtree input_valid(.in = in.d, .out = in.v, .supply = supply);

    // Address decoder
    decoder_dualrail decoder(.supply = supply);
    (i:NcA:
        decoder.in.d[i] = in.d.d[i];
    )

    // OrTree over acks from all registers
    ortree ack_ortree(.supply = supply);

    // C element handling in ack: in.a is produced from the OR of the register
    // acks together with the input validity.
    A_2C_B_X1 in_ack_Cel(.c1 = ack_ortree.out, .c2 = input_valid.out, .y = in.a, .vss = supply.vss, .vdd = supply.vdd);

    // Write bit selector: only the register addressed by the decoder sees the
    // write flag.
    bool _w = in.d.d[NcA+NcW].t;
    AND2_X1 write_selectors[M];
    (i:M:
        write_selectors[i].a = _w;
        write_selectors[i].b = decoder.out[i];
        write_selectors[i].vdd = supply.vdd;
        write_selectors[i].vss = supply.vss;
    )

    // Registers
    registerA registers[M];
    TIELO_X1 tielow_writebit_f[M];
    (i:M:
        // Connect each register to word inputs.
        (j:NcW:
            registers[i].in.d.d[j] = in.d.d[j + NcA];
        )
        // Connect the (selected) write bit; its false rail is tied low by a
        // TIELO cell.
        registers[i].in.d.d[NcW].t = write_selectors[i].y;
        tielow_writebit_f[i].vdd = supply.vdd;
        tielow_writebit_f[i].vss = supply.vss;
        registers[i].in.d.d[NcW].f = tielow_writebit_f[i].y;
        // Connect to ack ortree
        registers[i].in.a = ack_ortree.in[i];
        // Connect outputs
        data[i] = registers[i].out;
        registers[i].supply = supply;
        registers[i].reset_B = reset_B;
    )
}

}}