MatchLib
Loading...
Searching...
No Matches
arbitrated_crossbar.h
1/*
2 * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License")
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16#ifndef ARBITRATED_CROSSBAR_H
17#define ARBITRATED_CROSSBAR_H
18
19#include <nvhls_types.h>
20#include <fifo.h>
21#include <Arbiter.h>
22#include <one_hot_to_bin.h>
23#include <hls_globals.h>
24#include <nvhls_marshaller.h>
25#include <nvhls_message.h>
26
27#pragma map_to_operator [CCORE]
28#pragma ccore_type combinational
29template<int NumInputs, int NumOutputs>
30void transpose(NVUINTW(NumOutputs) (&requests_transpose)[NumInputs], NVUINTW(NumInputs) (&requests)[NumOutputs]) {
31#pragma hls_unroll yes
32 for (unsigned out = 0; out < NumOutputs; out++) {
33#pragma hls_unroll yes
34 for (unsigned in = 0; in < NumInputs; in++) {
35 requests[out][in] = requests_transpose[in][out];
36 }
37 }
38}
39
40// Need to add in virtual output queueing at the input. Can replace the input
41// FIFOs with a Queue class which internally has multiple parallel FIFOs and
42// keeps track of where to put each new input
77template <typename DataType, unsigned int NumInputs, unsigned int NumOutputs,
78 unsigned int LenInputBuffer, unsigned int LenOutputBuffer>
80
81 public:
82 // Define int types for input and output indices
83 static const int log2_inputs = nvhls::index_width<NumInputs>::val;
84 static const int log2_outputs = nvhls::index_width<NumOutputs>::val;
85
86 typedef NVUINTW(log2_inputs) InputIdx;
87 typedef NVUINTW(log2_outputs) OutputIdx;
88 #ifdef SKIP_LV2TYPE
89 typedef NVUINTW(Wrapped<DataType>::width + log2_outputs) DataDestType;
90 #endif
91
92 private:
93 // Convenience class which stores a data and destination
94 // This is what is stored in the input FIFOs
95 class DataDest : public nvhls_message {
96 public:
97 DataType data;
98 OutputIdx dest;
99 static const int width = Wrapped<DataType>::width + log2_outputs;
100 #ifdef SKIP_LV2TYPE
101 DataDestType data_dest;
102
103 void update_data_dest() {
104 data_dest = static_cast<DataDestType> (data) + (static_cast<DataDestType> (dest) << Wrapped<DataType>::width);
105 }
106
107 void extract_data_dest() {
108 data = static_cast<DataType> (data_dest);
109 dest = static_cast<OutputIdx> (data_dest >> Wrapped<DataType>::width);
110 }
111 #endif
112 template <unsigned int Size>
113 void Marshall(Marshaller<Size>& m) {
114 #ifdef SKIP_LV2TYPE
115 m& data_dest;
116 #else
117 m& data;
118 m& dest;
119 #endif
120 }
121 };
122
123 // Input + output FIFOs, arbiters
124 #ifndef SKIP_LV2TYPE
126 #else
128 #endif
130
131 Arbiter<NumInputs> arbiters[NumOutputs];
132
133 public:
134 ArbitratedCrossbar() { reset(); }
135
136 // reset function: reset all the queues
137 void reset() {
138#pragma hls_unroll yes
139 for (unsigned in = 0; in < NumInputs; in++) {
140 input_queues.reset();
141 }
142#pragma hls_unroll yes
143 for (unsigned out = 0; out < NumOutputs; out++) {
144 output_queues.reset();
145 arbiters[out].reset();
146 }
147 }
148
149 // The next few functions report status of a given input or output lane
150 bool isInputEmpty(InputIdx index) {
151 NVHLS_ASSERT_MSG(index <= NumInputs, "Input index greater than number of inputs");
152 return input_queues.isEmpty(index);
153 }
154
155 bool isOutputEmpty(OutputIdx index) {
156 NVHLS_ASSERT_MSG(index <= NumOutputs, "Output index greater than number of outputs");
157 return output_queues.isEmpty(index);
158 }
159
160 bool isInputFull(InputIdx index) {
161 NVHLS_ASSERT_MSG(index <= NumInputs, "Input index greater than number of inputs");
162 return input_queues.isFull(index);
163 }
164
165 bool isOutputFull(OutputIdx index) {
166 NVHLS_ASSERT_MSG(index <= NumOutputs, "Output index greater than number of outputs");
167 return output_queues.isFull(index);
168 }
169
170 // Add data to a specified input lane, with a specified destination lane
171 void push(DataType data, InputIdx src, OutputIdx dest) {
172 DataDest tmp;
173 tmp.data = data;
174 tmp.dest = dest;
175 #ifndef SKIP_LV2TYPE
176 input_queues.push(tmp, src);
177 #else
178 tmp.update_data_dest();
179 DataDestType tmp_data_dest;
180 tmp_data_dest = tmp.data_dest;
181 input_queues.push(tmp_data_dest, src);
182 #endif
183 }
184
185 DataType peek(OutputIdx index) { return output_queues.peek(index); }
186
187 // Pop the data from a specified output lane
188 DataType pop(OutputIdx index) { return output_queues.pop(index); }
189
190 // Run the crossbar (not the queues)
// One arbitration-and-routing step over already staged inputs. No queue is
// pushed or popped in here; the only member it uses is arbiters[], through
// pick().
//   input_data     - (in)  staged entry (data + dest) for each input lane
//   input_valid    - (in)  true if the lane's staged entry is a live request
//   input_consumed - (out) true if some output granted this input lane
//   data_out       - (out) payload routed to each output; only written from a
//                          granted input, so it is meaningful only where
//                          valid_out is true
//   valid_out      - (out) true if the output was ready and its arbiter
//                          returned a non-zero grant
//   output_ready   - (in)  outputs that are false are skipped entirely: their
//                          arbiter is not run in this call
//   source         - (out) index of the granted input for each output; 0 when
//                          nothing was granted
191#pragma map_to_operator [CCORE]
192#pragma ccore_type combinational
193 void xbar(DataDest input_data[NumInputs], bool input_valid[NumInputs],
194 bool input_consumed[NumInputs], DataType data_out[NumOutputs],
195 bool valid_out[NumOutputs], bool output_ready[NumOutputs], InputIdx source[NumOutputs]) {
196
197 // there is valid_out, so items that are not assigned are don't care
198 DataType data_out_tmp[NumOutputs];
199 InputIdx source_tmp[NumOutputs];
200// bool input_consumed_tmp[NumOutputs];
// One bit per input lane; bit `in` ends up set if any output granted lane `in`
201 NVUINTW(NumInputs) input_consumed_tmp = 0;
202 bool valid_out_tmp[NumOutputs];
203
204 // For each input lane, read the data at the head of the queue, and store it
205 // in a temporary array
206 // Also check the destination requested and form a request matrix (along
207 // with its transpose)
208 // Doing this facilitates writing it in one dimension and reading it in
209 // another
210 NVUINTW(NumInputs) requests[NumOutputs]; // requests for all outputs
211
212 NVUINTW(NumOutputs) requests_transpose[NumInputs];
213
214#pragma hls_unroll yes
215 for (unsigned in = 0; in < NumInputs; in++) {
216 // The request from one input lane is just a one-hot representation of its
217 // destination field
218 // but should be 0 if this input lane is empty
// NOTE: despite its name, `empty` holds the lane's request vector: the
// input_valid bit shifted to the position given by dest (all zeros when the
// lane is not valid)
219 NVUINTW(NumOutputs) empty = input_valid[in];
220 empty <<= input_data[in].dest;
221 requests_transpose[in] = empty;
222 }
223
224// Form transpose request matrix
225 transpose<NumInputs,NumOutputs>(requests_transpose, requests);
226
227// Keep track of which input queues need to be popped
228// This signal is the OR of all grant bits sent by output lanes to this
229// input lane
230// Need to do this so that all queues are popped once at the end in a input
231// lane loop
232// This removes a bottleneck in HLS where it was inferring a dependency
233// among pops
// NOTE: input_consumed_tmp was already initialised to 0 at its declaration,
// so this loop clears bits that are already clear
234#pragma hls_unroll yes
235 for (unsigned in = 0; in < NumInputs; in++) {
236 input_consumed_tmp[in] = false;
237 }
238
239// Loop over output lanes: run arbiter, then resolve contention
240#pragma hls_unroll yes
241 for (unsigned out = 0; out < NumOutputs; out++) {
// Defaults for an output that is not ready or receives no grant
242 valid_out_tmp[out] = false;
243 source_tmp[out] = 0;
244
245 if (output_ready[out]) {
246 NVUINTW(NumInputs) one_hot_grant = 0;
247 InputIdx source_local;
248
249 // Stall the arbiters and the crossbar if the output is full
250 // This is also needed to get any pipelining (otherwise the tool will
251 // infer that you want to write in a single cycle)
252 // For some reason separating these two if statements gives better results
253
254 // Run through the Arbiter pick() function, convert to binary
255 one_hot_grant = arbiters[out].pick(requests[out]);
256 one_hot_to_bin<NumInputs, log2_inputs>(one_hot_grant, source_local);
257 // Grant logic on input queues (OR gate)
258 input_consumed_tmp |= one_hot_grant;
259 // XBAR (using the data that was staged in the temporary array input_data)
// A zero grant means no input requested this output; the defaults set at the
// top of the iteration are kept in that case
260 if(!(one_hot_grant == 0)) {
261 data_out_tmp[out] = input_data[source_local].data;
262 valid_out_tmp[out] = true;
263 source_tmp[out] = source_local;
264 }
265 }
266 }
// Copy the staged per-output results to the output arguments
267#pragma hls_unroll yes
268 for(unsigned int k=0; k < NumOutputs; k++) {
269 data_out[k] = data_out_tmp[k];
270 source[k] = source_tmp[k];
271 valid_out[k] = valid_out_tmp[k];
272 }
// Unpack the consumed bit-vector into the per-input bool array
273#pragma hls_unroll yes
274 for(unsigned int k=0; k < NumInputs; k++) {
275 input_consumed[k] = input_consumed_tmp[k];
276 }
277 } // end xbar() function
278
279 public:
280 bool isAllInputEmpty() {
281 bool fifo_empty_internal[NumInputs + 1];
282 fifo_empty_internal[0] = true;
283#pragma hls_unroll yes
284 for (unsigned i = 0; i < NumInputs; i++) {
285 fifo_empty_internal[i + 1] = (isInputEmpty(i)) & fifo_empty_internal[i];
286 }
287 return fifo_empty_internal[NumInputs];
288 }
289
290 bool isAllOutputEmpty() {
291 bool fifo_empty_internal[NumOutputs + 1];
292 fifo_empty_internal[0] = true;
293#pragma hls_unroll yes
294 for (unsigned i = 0; i < NumOutputs; i++) {
295 fifo_empty_internal[i + 1] = (isOutputEmpty(i)) & fifo_empty_internal[i];
296 }
297 return fifo_empty_internal[NumOutputs];
298 }
299
300 bool isAllInputReady() {
301 bool fifo_ready_internal[NumInputs + 1];
302 fifo_ready_internal[0] = true;
303#pragma hls_unroll yes
304 for (unsigned i = 0; i < NumInputs; i++) {
305 fifo_ready_internal[i + 1] = (!isInputFull(i)) & fifo_ready_internal[i];
306 }
307 return fifo_ready_internal[NumInputs];
308 }
309
310 // Pop the data from all selected output lanes, data is already got from peek
311 void pop_all_lanes(bool valid_out[NumOutputs]) {
312#pragma hls_unroll yes
313 for (unsigned i = 0; i < NumOutputs; i++) {
314 if (valid_out[i]) {
315 output_queues.pop(i);
316 }
317 }
318 return;
319 }
325 // Top-Level function for Arbitrated Crossbar that returns source
326 // Interface description
327 // data_in - Array of inputs containing data
328 // dest_in - Array of inputs containing destination information for each input
329 // valid_in - Array of inputs indicating if the input is valid
330 // data_out - Array of outputs containing data
331 // valid_out - Array of outputs indicating if the output is valid
332 // ready - Array of outputs indicating if an input was ready. This also
333 // indicates if the input was successfully received by arbitrated Xbar
// source - Array of outputs holding, per output lane, the input index
// granted by xbar() in this call (0 when nothing was granted)
//
// Behaviour depends on the two buffer-length template parameters:
//  * LenInputBuffer > 0: valid inputs are pushed into the input queues (when
//    not full) and the crossbar arbitrates over the queue heads; granted
//    heads are advanced with incrHead(). ready[in] is false only when a
//    valid input met a full queue.
//  * LenInputBuffer == 0: inputs are fed to the crossbar directly and
//    ready[in] reports whether the input was granted in this call.
//  * LenOutputBuffer > 0: granted data is pushed into the output queues and
//    data_out/valid_out reflect the queue heads via peek; nothing is popped
//    here.
//  * LenOutputBuffer == 0: data_out/valid_out are the crossbar results.
// NOTE(review): with LenOutputBuffer > 0, data_out is the head of the output
// queue, which may come from an earlier call, while source always describes
// the grant made in this call - confirm callers do not pair the two.
334 void run(DataType data_in[NumInputs], OutputIdx dest_in[NumInputs],
335 bool valid_in[NumInputs], DataType data_out[NumOutputs],
336 bool valid_out[NumOutputs], bool ready[NumInputs], InputIdx source[NumOutputs]
337 ) {
338 // Need to read data into temporary variables to avoid scheduling problem in
339 // Catapult
340 OutputIdx destin_tmp[NumInputs];
341 bool valid_in_tmp[NumInputs];
342
343#pragma hls_unroll yes
344 for (unsigned in = 0; in < NumInputs; in++) {
345 destin_tmp[in] = dest_in[in];
346 valid_in_tmp[in] = valid_in[in];
347 }
348
// Staging arrays exchanged with xbar(); entries and output data start as
// all-zero bit patterns
349 DataDest input_data[NumInputs];
350 bool input_valid[NumInputs];
351 bool input_consumed[NumInputs];
352#pragma hls_unroll yes
353 for (unsigned i = 0; i < NumInputs; i++) {
354 input_data[i] = BitsToType<DataDest>(0);
355 }
356 DataType output_data[NumOutputs];
357 bool output_valid[NumOutputs];
358 bool output_ready[NumOutputs];
359#pragma hls_unroll yes
360 for (unsigned i = 0; i < NumOutputs; i++) {
361 output_data[i] = BitsToType<DataType>(0);
362 }
363
364 if (LenInputBuffer > 0) {
365// If lane is ready and input data is valid, write to it
366#pragma hls_unroll yes
367 for (unsigned in = 0; in < NumInputs; in++) {
// ready is deasserted only when a valid input could not be accepted
368 ready[in] = !isInputFull(in) || !valid_in_tmp[in];
369 if (!isInputFull(in) & valid_in_tmp[in]) {
370 // push(datain_tmp[in], in, destin_tmp[in]);
371 push(data_in[in], in, destin_tmp[in]);
372 }
// Emptiness is sampled after the push above, then the head entry is staged
// for arbitration
373 input_valid[in] = !isInputEmpty(in);
374 if (input_valid[in]) {
375 #ifndef SKIP_LV2TYPE
376 input_data[in] = input_queues.peek(in);
377 #else
378 input_data[in].data_dest = input_queues.peek(in);
379 input_data[in].extract_data_dest();
380 #endif
381 }
382 }
383
384 } else {
// No input buffering: present the inputs to the crossbar directly
385#pragma hls_unroll yes
386 for (unsigned in = 0; in < NumInputs; in++) {
387 input_data[in].data = data_in[in];
388 input_data[in].dest = dest_in[in];
389 input_valid[in] = valid_in_tmp[in];
390 }
391 }
// Loop body is a commented-out debug print, so this loop does nothing
392 for (unsigned in = 0; in < NumInputs; in++) {
393 //DCOUT("DUT - Input: " << in << "\t valid: " << input_valid[in] << "\t dest: " << input_data[in].dest << "\t data: " << input_data[in].data << endl);
394 }
// An output can accept a grant if its queue is not full; without output
// buffering it is always treated as ready
395 if (LenOutputBuffer > 0) {
396#pragma hls_unroll yes
397 for (unsigned out = 0; out < NumOutputs; out++) {
398 output_ready[out] = !isOutputFull(out);
399 }
400 } else {
401#pragma hls_unroll yes
402 for (unsigned out = 0; out < NumOutputs; out++) {
403 output_ready[out] = true;
404 }
405 }
406
407 // Process the XBAR and arbiters
408 xbar(input_data, input_valid, input_consumed, output_data, output_valid,
409 output_ready, source);
// Loop body is a commented-out debug print, so this loop does nothing
410 for (unsigned out = 0; out < NumOutputs; out++) {
411 //DCOUT("DUT - Output: " << out << "\t valid: " << output_valid[out] << "\t data: " << output_data[out] << "\tReady: " << output_ready[out] << endl);
412 }
413
414 if (LenInputBuffer > 0) {
415// Increment the head pointer of the input queues if their requests were granted
416// Do this in a single loop over input lanes so it's clear to the tool that each
417// input channel
418// is only popped once
419#pragma hls_unroll yes
420 for (unsigned in = 0; in < NumInputs; in++) {
421 if (input_consumed[in]) {
422 input_queues.incrHead(in);
423 }
424 }
425 } else {
// Without input buffering, ready reports whether the input was granted
426#pragma hls_unroll yes
427 for (unsigned in = 0; in < NumInputs; in++) {
428 ready[in] = input_consumed[in];
429 }
430 }
431
432 if (LenOutputBuffer > 0) {
433// Read from each output channel if it is not empty
434#pragma hls_unroll yes
435 for (unsigned out = 0; out < NumOutputs; out++) {
// Store this call's granted data, then expose the queue head; data_out[out]
// is left unassigned when the queue is empty
436 if (output_valid[out]) {
437 output_queues.push(output_data[out], out);
438 }
439 valid_out[out] = !isOutputEmpty(out);
440 if (!isOutputEmpty(out)) {
441 data_out[out] = peek(out);
442 }
443 /*peek only
444 if (!isOutputEmpty(out)) {
445 data_out[out] = pop(out);
446 }
447 */
448 }
// Loop body is a commented-out debug print, so this loop does nothing
449 for (unsigned out = 0; out < NumOutputs; out++) {
450 //DCOUT("DUT - ArbXbar Output: " << out << "\t valid: " << valid_out[out] << "\t data: " << data_out[out] << endl);
451 }
452 } else {
// No output buffering: forward the crossbar results unchanged
453#pragma hls_unroll yes
454 for (unsigned out = 0; out < NumOutputs; out++) {
455 data_out[out] = output_data[out];
456 valid_out[out] = output_valid[out];
457 }
458 }
459 } // end run() function
465 void run(DataType data_in[NumInputs], OutputIdx dest_in[NumInputs],
466 bool valid_in[NumInputs], DataType data_out[NumOutputs],
467 bool valid_out[NumOutputs], bool ready[NumInputs]) {
468 InputIdx source[NumOutputs];
469 run(data_in, dest_in,
470 valid_in, data_out,
471 valid_out, ready, source);
472 } // end run() function
473
474
475}; // end ArbitratedCrossbar class
476
477#endif // end ARBITRATED_CROSSBAR_H
A generalized implementation of generic n-way roundrobin arbiter.
Definition Arbiter.h:61
Crossbar with conflict arbitration and input queuing.
Configurable FIFO class.
Definition fifo.h:65
void run(DataType data_in[NumInputs], OutputIdx dest_in[NumInputs], bool valid_in[NumInputs], DataType data_out[NumOutputs], bool valid_out[NumOutputs], bool ready[NumInputs], InputIdx source[NumOutputs])
Top-Level function for Arbitrated Crossbar.
void run(DataType data_in[NumInputs], OutputIdx dest_in[NumInputs], bool valid_in[NumInputs], DataType data_out[NumOutputs], bool valid_out[NumOutputs], bool ready[NumInputs])
Top-Level function for Arbitrated Crossbar that does not return source.
#define NVHLS_ASSERT_MSG(X, MSG)
#define NVUINTW(width)
Definition nvhls_types.h:35
Compute index width of a constant.
Definition nvhls_int.h:285