MatchLib
Loading...
Searching...
No Matches
arbitrated_crossbar.h
1/*
2 * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License")
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16#ifndef ARBITRATED_CROSSBAR_H
17#define ARBITRATED_CROSSBAR_H
18
19#include <nvhls_types.h>
20#include <fifo.h>
21#include <Arbiter.h>
22#include <one_hot_to_bin.h>
23#include <hls_globals.h>
24#include <nvhls_marshaller.h>
25#include <nvhls_message.h>
26
27#pragma map_to_operator [CCORE]
28#pragma ccore_type combinational
29template<int NumInputs, int NumOutputs>
30void transpose(NVUINTW(NumOutputs) (&requests_transpose)[NumInputs], NVUINTW(NumInputs) (&requests)[NumOutputs]) {
31#pragma hls_unroll yes
32 for (unsigned out = 0; out < NumOutputs; out++) {
33#pragma hls_unroll yes
34 for (unsigned in = 0; in < NumInputs; in++) {
35 requests[out][in] = requests_transpose[in][out];
36 }
37 }
38}
39
40#pragma map_to_operator [CCORE]
41#pragma ccore_type combinational
42template<int NumInputs>
43NVUINTW(NumInputs) arbiter_pick(NVUINTW(NumInputs) request, Arbiter<NumInputs> arbiter_in, Arbiter<NumInputs>& arbiter_out) {
44 NVUINTW(NumInputs) one_hot_grant = arbiter_in.pick(request);
45 arbiter_out = arbiter_in;
46 return one_hot_grant;
47}
48
49// Need to add in virtual output queueing at the input. Can replace the input
50// FIFOs with a Queue class which internally has multiple parallel FIFOs and
51// keeps track of where to put each new input
86template <typename DataType, unsigned int NumInputs, unsigned int NumOutputs,
87 unsigned int LenInputBuffer, unsigned int LenOutputBuffer>
89
90 public:
91 // Define int types for input and output indices
92 static const int log2_inputs = nvhls::index_width<NumInputs>::val;
93 static const int log2_outputs = nvhls::index_width<NumOutputs>::val;
94
95 typedef NVUINTW(log2_inputs) InputIdx;
96 typedef NVUINTW(log2_outputs) OutputIdx;
97 #ifdef SKIP_LV2TYPE
98 typedef NVUINTW(Wrapped<DataType>::width + log2_outputs) DataDestType;
99 #endif
100
101 private:
102 // Convenience class which stores a data and destination
103 // This is what is stored in the input FIFOs
104 class DataDest : public nvhls_message {
105 public:
106 DataType data;
107 OutputIdx dest;
108 static const int width = Wrapped<DataType>::width + log2_outputs;
109 #ifdef SKIP_LV2TYPE
110 DataDestType data_dest;
111
112 void update_data_dest() {
113 data_dest = static_cast<DataDestType> (data) + (static_cast<DataDestType> (dest) << Wrapped<DataType>::width);
114 }
115
116 void extract_data_dest() {
117 data = static_cast<DataType> (data_dest);
118 dest = static_cast<OutputIdx> (data_dest >> Wrapped<DataType>::width);
119 }
120 #endif
121 template <unsigned int Size>
122 void Marshall(Marshaller<Size>& m) {
123 #ifdef SKIP_LV2TYPE
124 m& data_dest;
125 #else
126 m& data;
127 m& dest;
128 #endif
129 }
130 };
131
132 // Input + output FIFOs, arbiters
133 #ifndef SKIP_LV2TYPE
135 #else
137 #endif
139
140 Arbiter<NumInputs> arbiters[NumOutputs];
141
142 public:
143 ArbitratedCrossbar() { reset(); }
144
145 // reset function: reset all the queues
146 void reset() {
147#pragma hls_unroll yes
148 for (unsigned in = 0; in < NumInputs; in++) {
149 input_queues.reset();
150 }
151#pragma hls_unroll yes
152 for (unsigned out = 0; out < NumOutputs; out++) {
153 output_queues.reset();
154 arbiters[out].reset();
155 }
156 }
157
158 // The next few functions report status of a given input or output lane
159 bool isInputEmpty(InputIdx index) {
160 NVHLS_ASSERT_MSG(index <= NumInputs, "Input index greater than number of inputs");
161 return input_queues.isEmpty(index);
162 }
163
164 bool isOutputEmpty(OutputIdx index) {
165 NVHLS_ASSERT_MSG(index <= NumOutputs, "Output index greater than number of outputs");
166 return output_queues.isEmpty(index);
167 }
168
169 bool isInputFull(InputIdx index) {
170 NVHLS_ASSERT_MSG(index <= NumInputs, "Input index greater than number of inputs");
171 return input_queues.isFull(index);
172 }
173
174 bool isOutputFull(OutputIdx index) {
175 NVHLS_ASSERT_MSG(index <= NumOutputs, "Output index greater than number of outputs");
176 return output_queues.isFull(index);
177 }
178
179 // Add data to a specified input lane, with a specified destination lane
180 void push(DataType data, InputIdx src, OutputIdx dest) {
181 DataDest tmp;
182 tmp.data = data;
183 tmp.dest = dest;
184 #ifndef SKIP_LV2TYPE
185 input_queues.push(tmp, src);
186 #else
187 tmp.update_data_dest();
188 DataDestType tmp_data_dest;
189 tmp_data_dest = tmp.data_dest;
190 input_queues.push(tmp_data_dest, src);
191 #endif
192 }
193
194 DataType peek(OutputIdx index) { return output_queues.peek(index); }
195
196 // Pop the data from a specified output lane
197 DataType pop(OutputIdx index) { return output_queues.pop(index); }
198
199 // Run the crossbar (not the queues)
200//#pragma map_to_operator [CCORE]
201//#pragma ccore_type combinational
202 void xbar(DataDest input_data[NumInputs], bool input_valid[NumInputs],
203 bool input_consumed[NumInputs], DataType data_out[NumOutputs],
204 bool valid_out[NumOutputs], bool output_ready[NumOutputs], InputIdx source[NumOutputs]) {
205
206 // there is valid_out, so items that are not assigned are don't care
207 DataType data_out_tmp[NumOutputs];
208 InputIdx source_tmp[NumOutputs];
209// bool input_consumed_tmp[NumOutputs];
210 NVUINTW(NumInputs) input_consumed_tmp = 0;
211 bool valid_out_tmp[NumOutputs];
212
213 // For each input lane, read the data at the head of the queue, and store it
214 // in a temporary array
215 // Also check the destination requested and form a request matrix (along
216 // with its transpose)
217 // Doing this facilitates writing it in one dimension and reading it in
218 // another
219 NVUINTW(NumInputs) requests[NumOutputs]; // requests for all outputs
220
221 NVUINTW(NumOutputs) requests_transpose[NumInputs];
222
223#pragma hls_unroll yes
224 for (unsigned in = 0; in < NumInputs; in++) {
225 // The request from one input lane is just a one-hot representation of its
226 // destination field
227 // but should be 0 if this input lane is empty
228 NVUINTW(NumOutputs) empty = input_valid[in];
229 empty <<= input_data[in].dest;
230 requests_transpose[in] = empty;
231 }
232
233// Form transpose request matrix
234 transpose<NumInputs,NumOutputs>(requests_transpose, requests);
235
236// Keep track of which input queues need to be popped
237// This signal is the OR of all grant bits sent by output lanes to this
238// input lane
239// Need to do this so that all queues are popped once at the end in a input
240// lane loop
241// This removes a bottleneck in HLS where it was inferring a dependency
242// among pops
243#pragma hls_unroll yes
244 for (unsigned in = 0; in < NumInputs; in++) {
245 input_consumed_tmp[in] = false;
246 }
247
248// Loop over output lanes: run arbiter, then resolve contention
249#pragma hls_unroll yes
250 for (unsigned out = 0; out < NumOutputs; out++) {
251 valid_out_tmp[out] = false;
252 source_tmp[out] = 0;
253
254 if (output_ready[out]) {
255 NVUINTW(NumInputs) one_hot_grant = 0;
256 InputIdx source_local;
257
258 // Stall the arbiters and the crossbar if the output is full
259 // This is also needed to get any pipelining (otherwise the tool will
260 // infer that you want to write in a single cycle)
261 // For some reason separating these two if statements gives better results
262
263 // Run through the Arbiter pick() function, convert to binary
264 one_hot_grant = arbiter_pick<NumInputs>(requests[out], arbiters[out], arbiters[out]);
265 // one_hot_grant = arbiters[out].pick(requests[out]);
266 one_hot_to_bin<NumInputs, log2_inputs>(one_hot_grant, source_local);
267 // Grant logic on input queues (OR gate)
268 input_consumed_tmp |= one_hot_grant;
269 // XBAR (using the data that was staged in the temporary array input_data)
270 if(!(one_hot_grant == 0)) {
271 data_out_tmp[out] = input_data[source_local].data;
272 valid_out_tmp[out] = true;
273 source_tmp[out] = source_local;
274 }
275 }
276 }
277#pragma hls_unroll yes
278 for(unsigned int k=0; k < NumOutputs; k++) {
279 data_out[k] = data_out_tmp[k];
280 source[k] = source_tmp[k];
281 valid_out[k] = valid_out_tmp[k];
282 }
283#pragma hls_unroll yes
284 for(unsigned int k=0; k < NumInputs; k++) {
285 input_consumed[k] = input_consumed_tmp[k];
286 }
287 } // end xbar() function
288
289 public:
290 bool isAllInputEmpty() {
291 bool fifo_empty_internal[NumInputs + 1];
292 fifo_empty_internal[0] = true;
293#pragma hls_unroll yes
294 for (unsigned i = 0; i < NumInputs; i++) {
295 fifo_empty_internal[i + 1] = (isInputEmpty(i)) & fifo_empty_internal[i];
296 }
297 return fifo_empty_internal[NumInputs];
298 }
299
300 bool isAllOutputEmpty() {
301 bool fifo_empty_internal[NumOutputs + 1];
302 fifo_empty_internal[0] = true;
303#pragma hls_unroll yes
304 for (unsigned i = 0; i < NumOutputs; i++) {
305 fifo_empty_internal[i + 1] = (isOutputEmpty(i)) & fifo_empty_internal[i];
306 }
307 return fifo_empty_internal[NumOutputs];
308 }
309
310 bool isAllInputReady() {
311 bool fifo_ready_internal[NumInputs + 1];
312 fifo_ready_internal[0] = true;
313#pragma hls_unroll yes
314 for (unsigned i = 0; i < NumInputs; i++) {
315 fifo_ready_internal[i + 1] = (!isInputFull(i)) & fifo_ready_internal[i];
316 }
317 return fifo_ready_internal[NumInputs];
318 }
319
320 // Pop the data from all selected output lanes, data is already got from peek
321 void pop_all_lanes(bool valid_out[NumOutputs]) {
322#pragma hls_unroll yes
323 for (unsigned i = 0; i < NumOutputs; i++) {
324 if (valid_out[i]) {
325 output_queues.pop(i);
326 }
327 }
328 return;
329 }
335 // Top-Level function for Arbitrated Crossbar that returns source
336 // Interface description
337 // data_in - Array of inputs containing data
338 // dest_in - Array of inputs containing destination information for each input
339 // valid_in - Array of inputs indicating if the input is valid
340 // data_out - Array of outputs containing data
341 // valid_out - Array of outputs indicating if the output is valid
342 // ready - Array of outputs indicating if an input was ready. This also
343 // indicates of the input was successfully received by arbitrated Xbar
344 void run(DataType data_in[NumInputs], OutputIdx dest_in[NumInputs],
345 bool valid_in[NumInputs], DataType data_out[NumOutputs],
346 bool valid_out[NumOutputs], bool ready[NumInputs], InputIdx source[NumOutputs]
347 ) {
348 // Need to read data into temporary variables to avoid scheduling problem in
349 // Catapult
350 OutputIdx destin_tmp[NumInputs];
351 bool valid_in_tmp[NumInputs];
352
353#pragma hls_unroll yes
354 for (unsigned in = 0; in < NumInputs; in++) {
355 destin_tmp[in] = dest_in[in];
356 valid_in_tmp[in] = valid_in[in];
357 }
358
359 DataDest input_data[NumInputs];
360 bool input_valid[NumInputs];
361 bool input_consumed[NumInputs];
362#pragma hls_unroll yes
363 for (unsigned i = 0; i < NumInputs; i++) {
364 input_data[i] = BitsToType<DataDest>(0);
365 }
366 DataType output_data[NumOutputs];
367 bool output_valid[NumOutputs];
368 bool output_ready[NumOutputs];
369#pragma hls_unroll yes
370 for (unsigned i = 0; i < NumOutputs; i++) {
371 output_data[i] = BitsToType<DataType>(0);
372 }
373
374 if (LenInputBuffer > 0) {
375// If lane is ready and input data is valid, write to it
376#pragma hls_unroll yes
377 for (unsigned in = 0; in < NumInputs; in++) {
378 ready[in] = !isInputFull(in) || !valid_in_tmp[in];
379 if (!isInputFull(in) & valid_in_tmp[in]) {
380 // push(datain_tmp[in], in, destin_tmp[in]);
381 push(data_in[in], in, destin_tmp[in]);
382 }
383 input_valid[in] = !isInputEmpty(in);
384 if (input_valid[in]) {
385 #ifndef SKIP_LV2TYPE
386 input_data[in] = input_queues.peek(in);
387 #else
388 input_data[in].data_dest = input_queues.peek(in);
389 input_data[in].extract_data_dest();
390 #endif
391 }
392 }
393
394 } else {
395#pragma hls_unroll yes
396 for (unsigned in = 0; in < NumInputs; in++) {
397 input_data[in].data = data_in[in];
398 input_data[in].dest = dest_in[in];
399 input_valid[in] = valid_in_tmp[in];
400 }
401 }
402 for (unsigned in = 0; in < NumInputs; in++) {
403 //DCOUT("DUT - Input: " << in << "\t valid: " << input_valid[in] << "\t dest: " << input_data[in].dest << "\t data: " << input_data[in].data << endl);
404 }
405 if (LenOutputBuffer > 0) {
406#pragma hls_unroll yes
407 for (unsigned out = 0; out < NumOutputs; out++) {
408 output_ready[out] = !isOutputFull(out);
409 }
410 } else {
411#pragma hls_unroll yes
412 for (unsigned out = 0; out < NumOutputs; out++) {
413 output_ready[out] = true;
414 }
415 }
416
417 // Process the XBAR and arbiters
418 xbar(input_data, input_valid, input_consumed, output_data, output_valid,
419 output_ready, source);
420 for (unsigned out = 0; out < NumOutputs; out++) {
421 //DCOUT("DUT - Output: " << out << "\t valid: " << output_valid[out] << "\t data: " << output_data[out] << "\tReady: " << output_ready[out] << endl);
422 }
423
424 if (LenInputBuffer > 0) {
425// Increment the head pointer of the input queues if their requests were granted
426// Do this in a single loop over input lanes so it's clear to the tool that each
427// input channel
428// is only popped once
429#pragma hls_unroll yes
430 for (unsigned in = 0; in < NumInputs; in++) {
431 if (input_consumed[in]) {
432 input_queues.incrHead(in);
433 }
434 }
435 } else {
436#pragma hls_unroll yes
437 for (unsigned in = 0; in < NumInputs; in++) {
438 ready[in] = input_consumed[in];
439 }
440 }
441
442 if (LenOutputBuffer > 0) {
443// Read from each output channel if it is not empty
444#pragma hls_unroll yes
445 for (unsigned out = 0; out < NumOutputs; out++) {
446 if (output_valid[out]) {
447 output_queues.push(output_data[out], out);
448 }
449 valid_out[out] = !isOutputEmpty(out);
450 if (!isOutputEmpty(out)) {
451 data_out[out] = peek(out);
452 }
453 /*peek only
454 if (!isOutputEmpty(out)) {
455 data_out[out] = pop(out);
456 }
457 */
458 }
459 for (unsigned out = 0; out < NumOutputs; out++) {
460 //DCOUT("DUT - ArbXbar Output: " << out << "\t valid: " << valid_out[out] << "\t data: " << data_out[out] << endl);
461 }
462 } else {
463#pragma hls_unroll yes
464 for (unsigned out = 0; out < NumOutputs; out++) {
465 data_out[out] = output_data[out];
466 valid_out[out] = output_valid[out];
467 }
468 }
469 } // end run() function
475 void run(DataType data_in[NumInputs], OutputIdx dest_in[NumInputs],
476 bool valid_in[NumInputs], DataType data_out[NumOutputs],
477 bool valid_out[NumOutputs], bool ready[NumInputs]) {
478 InputIdx source[NumOutputs];
479 run(data_in, dest_in,
480 valid_in, data_out,
481 valid_out, ready, source);
482 } // end run() function
483
484
485}; // end ArbitratedCrossbar class
486
487#endif // end ARBITRATED_CROSSBAR_H
A generalized implementation of a generic n-way round-robin arbiter.
Definition Arbiter.h:61
Crossbar with conflict arbitration and input queuing.
Configurable FIFO class.
Definition fifo.h:65
void run(DataType data_in[NumInputs], OutputIdx dest_in[NumInputs], bool valid_in[NumInputs], DataType data_out[NumOutputs], bool valid_out[NumOutputs], bool ready[NumInputs], InputIdx source[NumOutputs])
Top-Level function for Arbitrated Crossbar.
void run(DataType data_in[NumInputs], OutputIdx dest_in[NumInputs], bool valid_in[NumInputs], DataType data_out[NumOutputs], bool valid_out[NumOutputs], bool ready[NumInputs])
Top-Level function for Arbitrated Crossbar that does not return source.
#define NVHLS_ASSERT_MSG(X, MSG)
#define NVUINTW(width)
Definition nvhls_types.h:35
Compute index width of a constant.
Definition nvhls_int.h:285