MatchLib
arbitrated_crossbar.h
1 /*
2  * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License")
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #ifndef ARBITRATED_CROSSBAR_H
17 #define ARBITRATED_CROSSBAR_H
18 
19 #include <nvhls_types.h>
20 #include <fifo.h>
21 #include <Arbiter.h>
22 #include <one_hot_to_bin.h>
23 #include <hls_globals.h>
24 #include <nvhls_marshaller.h>
25 #include <nvhls_message.h>
26 
27 // TODO: add virtual output queueing at the inputs. The input FIFOs could be
28 // replaced with a Queue class that internally holds multiple parallel FIFOs
29 // and keeps track of where to put each new input.
64 template <typename DataType, unsigned int NumInputs, unsigned int NumOutputs,
65  unsigned int LenInputBuffer, unsigned int LenOutputBuffer>
67 
68  public:
69  // Define int types for input and output indices
70  static const int log2_inputs = nvhls::index_width<NumInputs>::val;
71  static const int log2_outputs = nvhls::index_width<NumOutputs>::val;
72 
73  typedef NVUINTW(log2_inputs) InputIdx;
74  typedef NVUINTW(log2_outputs) OutputIdx;
75  #ifdef SKIP_LV2TYPE
76  typedef NVUINTW(Wrapped<DataType>::width + log2_outputs) DataDestType;
77  #endif
78 
79  private:
80  // Convenience class which stores a data and destination
81  // This is what is stored in the input FIFOs
82  class DataDest : public nvhls_message {
83  public:
84  DataType data;
85  OutputIdx dest;
86  static const int width = Wrapped<DataType>::width + log2_outputs;
87  #ifdef SKIP_LV2TYPE
88  DataDestType data_dest;
89 
90  void update_data_dest() {
91  data_dest = static_cast<DataDestType> (data) + (static_cast<DataDestType> (dest) << Wrapped<DataType>::width);
92  }
93 
94  void extract_data_dest() {
95  data = static_cast<DataType> (data_dest);
96  dest = static_cast<OutputIdx> (data_dest >> Wrapped<DataType>::width);
97  }
98  #endif
99  template <unsigned int Size>
100  void Marshall(Marshaller<Size>& m) {
101  #ifdef SKIP_LV2TYPE
102  m& data_dest;
103  #else
104  m& data;
105  m& dest;
106  #endif
107  }
108  };
109 
110  // Input + output FIFOs, arbiters
111  #ifndef SKIP_LV2TYPE
113  #else
115  #endif
117 
118  Arbiter<NumInputs> arbiters[NumOutputs];
119 
120  public:
121  ArbitratedCrossbar() { reset(); }
122 
123  // reset function: reset all the queues
124  void reset() {
125 #pragma hls_unroll yes
126  for (unsigned in = 0; in < NumInputs; in++) {
127  input_queues.reset();
128  }
129 #pragma hls_unroll yes
130  for (unsigned out = 0; out < NumOutputs; out++) {
131  output_queues.reset();
132  arbiters[out].reset();
133  }
134  }
135 
136  // The next few functions report status of a given input or output lane
137  bool isInputEmpty(InputIdx index) {
138  NVHLS_ASSERT_MSG(index <= NumInputs, "Input index greater than number of inputs");
139  return input_queues.isEmpty(index);
140  }
141 
142  bool isOutputEmpty(OutputIdx index) {
143  NVHLS_ASSERT_MSG(index <= NumOutputs, "Output index greater than number of outputs");
144  return output_queues.isEmpty(index);
145  }
146 
147  bool isInputFull(InputIdx index) {
148  NVHLS_ASSERT_MSG(index <= NumInputs, "Input index greater than number of inputs");
149  return input_queues.isFull(index);
150  }
151 
152  bool isOutputFull(OutputIdx index) {
153  NVHLS_ASSERT_MSG(index <= NumOutputs, "Output index greater than number of outputs");
154  return output_queues.isFull(index);
155  }
156 
157  // Add data to a specified input lane, with a specified destination lane
158  void push(DataType data, InputIdx src, OutputIdx dest) {
159  DataDest tmp;
160  tmp.data = data;
161  tmp.dest = dest;
162  #ifndef SKIP_LV2TYPE
163  input_queues.push(tmp, src);
164  #else
165  tmp.update_data_dest();
166  DataDestType tmp_data_dest;
167  tmp_data_dest = tmp.data_dest;
168  input_queues.push(tmp_data_dest, src);
169  #endif
170  }
171 
172  DataType peek(OutputIdx index) { return output_queues.peek(index); }
173 
174  // Pop the data from a specified output lane
175  DataType pop(OutputIdx index) { return output_queues.pop(index); }
176 
177  // Run the crossbar (not the queues)
178  void xbar(DataDest input_data[NumInputs], bool input_valid[NumInputs],
179  bool input_consumed[NumInputs], DataType data_out[NumOutputs],
180  bool valid_out[NumOutputs], bool output_ready[NumOutputs], InputIdx source[NumOutputs]) {
181 
182  // For each input lane, read the data at the head of the queue, and store it
183  // in a temporary array
184  // Also check the destination requested and form a request matrix (along
185  // with its transpose)
186  // Doing this facilitates writing it in one dimension and reading it in
187  // another
188  NVUINTW(NumInputs) requests[NumOutputs]; // requests for all outputs
189 
190  NVUINTW(NumOutputs) requests_transpose[NumInputs];
191 
192 #pragma hls_unroll yes
193  for (unsigned in = 0; in < NumInputs; in++) {
194  // The request from one input lane is just a one-hot representation of its
195  // destination field
196  // but should be 0 if this input lane is empty
197  NVUINTW(NumOutputs) empty = input_valid[in];
198  empty <<= input_data[in].dest;
199  requests_transpose[in] = empty;
200  }
201 
202 // Form transpose request matrix
203 #pragma hls_unroll yes
204  for (unsigned out = 0; out < NumOutputs; out++) {
205 #pragma hls_unroll yes
206  for (unsigned in = 0; in < NumInputs; in++) {
207  requests[out][in] = requests_transpose[in][out];
208  }
209  }
210 
211 // Keep track of which input queues need to be popped
212 // This signal is the OR of all grant bits sent by output lanes to this
213 // input lane
214 // Need to do this so that all queues are popped once at the end in a input
215 // lane loop
216 // This removes a bottleneck in HLS where it was inferring a dependency
217 // among pops
218 #pragma hls_unroll yes
219  for (unsigned in = 0; in < NumInputs; in++) {
220  input_consumed[in] = false;
221  }
222 
223 // Loop over output lanes: run arbiter, then resolve contention
224 #pragma hls_unroll yes
225  for (unsigned out = 0; out < NumOutputs; out++) {
226  valid_out[out] = false;
227 
228  NVUINTW(NumInputs) one_hot_grant = 0;
229  InputIdx source_local;
230 
231  // Stall the arbiters and the crossbar if the output is full
232  // This is also needed to get any pipelining (otherwise the tool will
233  // infer that you want to write in a single cycle)
234  // For some reason separating these two if statements gives better results
235  if (output_ready[out]) {
236 
237  // Run through the Arbiter pick() function, convert to binary
238  one_hot_grant = arbiters[out].pick(requests[out]);
239  one_hot_to_bin<NumInputs, log2_inputs>(one_hot_grant, source_local);
240  }
241 
242 // Grant logic on input queues (OR gate)
243 #pragma hls_unroll
244  for (unsigned in = 0; in < NumInputs; in++) {
245  // pop_inputs[in] = pop_inputs[in] | (one_hot_grant[in] == 1);
246  input_consumed[in] = input_consumed[in] | (one_hot_grant[in] == 1);
247  }
248 
249  // XBAR (using the data that was staged in the temporary array input_data)
250  if ((!(one_hot_grant == 0)) && (output_ready[out])) {
251  data_out[out] = input_data[source_local].data;
252  valid_out[out] = true;
253  source[out] = source_local;
254  }
255  }
256  } // end xbar() function
257 
258  public:
259  bool isAllInputEmpty() {
260  bool fifo_empty_internal[NumInputs + 1];
261  fifo_empty_internal[0] = true;
262 #pragma hls_unroll yes
263  for (unsigned i = 0; i < NumInputs; i++) {
264  fifo_empty_internal[i + 1] = (isInputEmpty(i)) & fifo_empty_internal[i];
265  }
266  return fifo_empty_internal[NumInputs];
267  }
268 
269  bool isAllOutputEmpty() {
270  bool fifo_empty_internal[NumOutputs + 1];
271  fifo_empty_internal[0] = true;
272 #pragma hls_unroll yes
273  for (unsigned i = 0; i < NumOutputs; i++) {
274  fifo_empty_internal[i + 1] = (isOutputEmpty(i)) & fifo_empty_internal[i];
275  }
276  return fifo_empty_internal[NumOutputs];
277  }
278 
279  bool isAllInputReady() {
280  bool fifo_ready_internal[NumInputs + 1];
281  fifo_ready_internal[0] = true;
282 #pragma hls_unroll yes
283  for (unsigned i = 0; i < NumInputs; i++) {
284  fifo_ready_internal[i + 1] = (!isInputFull(i)) & fifo_ready_internal[i];
285  }
286  return fifo_ready_internal[NumInputs];
287  }
288 
289  // Pop the data from all selected output lanes, data is already got from peek
290  void pop_all_lanes(bool valid_out[NumOutputs]) {
291 #pragma hls_unroll yes
292  for (unsigned i = 0; i < NumOutputs; i++) {
293  if (valid_out[i]) {
294  output_queues.pop(i);
295  }
296  }
297  return;
298  }
304  // Top-Level function for Arbitrated Crossbar that returns source
305  // Interface description
306  // data_in - Array of inputs containing data
307  // dest_in - Array of inputs containing destination information for each input
308  // valid_in - Array of inputs indicating if the input is valid
309  // data_out - Array of outputs containing data
310  // valid_out - Array of outputs indicating if the output is valid
311  // ready - Array of outputs indicating if an input was ready. This also
312  // indicates of the input was successfully received by arbitrated Xbar
313  void run(DataType data_in[NumInputs], OutputIdx dest_in[NumInputs],
314  bool valid_in[NumInputs], DataType data_out[NumOutputs],
315  bool valid_out[NumOutputs], bool ready[NumInputs], InputIdx source[NumOutputs]
316  ) {
317  // Need to read data into temporary variables to avoid scheduling problem in
318  // Catapult
319  OutputIdx destin_tmp[NumInputs];
320  bool valid_in_tmp[NumInputs];
321 
322 #pragma hls_unroll yes
323  for (unsigned in = 0; in < NumInputs; in++) {
324  destin_tmp[in] = dest_in[in];
325  valid_in_tmp[in] = valid_in[in];
326  }
327 
328  DataDest input_data[NumInputs];
329  bool input_valid[NumInputs];
330  bool input_consumed[NumInputs];
331 #pragma hls_unroll yes
332  for (unsigned i = 0; i < NumInputs; i++) {
333  input_data[i] = BitsToType<DataDest>(0);
334  }
335  DataType output_data[NumOutputs];
336  bool output_valid[NumOutputs];
337  bool output_ready[NumOutputs];
338 #pragma hls_unroll yes
339  for (unsigned i = 0; i < NumOutputs; i++) {
340  output_data[i] = BitsToType<DataType>(0);
341  }
342 
343  if (LenInputBuffer > 0) {
344 // If lane is ready and input data is valid, write to it
345 #pragma hls_unroll yes
346  for (unsigned in = 0; in < NumInputs; in++) {
347  ready[in] = !isInputFull(in) || !valid_in_tmp[in];
348  if (!isInputFull(in) & valid_in_tmp[in]) {
349  // push(datain_tmp[in], in, destin_tmp[in]);
350  push(data_in[in], in, destin_tmp[in]);
351  }
352  input_valid[in] = !isInputEmpty(in);
353  if (input_valid[in]) {
354  #ifndef SKIP_LV2TYPE
355  input_data[in] = input_queues.peek(in);
356  #else
357  input_data[in].data_dest = input_queues.peek(in);
358  input_data[in].extract_data_dest();
359  #endif
360  }
361  }
362 
363  } else {
364 #pragma hls_unroll yes
365  for (unsigned in = 0; in < NumInputs; in++) {
366  input_data[in].data = data_in[in];
367  input_data[in].dest = dest_in[in];
368  input_valid[in] = valid_in_tmp[in];
369  }
370  }
371  for (unsigned in = 0; in < NumInputs; in++) {
372  //DCOUT("DUT - Input: " << in << "\t valid: " << input_valid[in] << "\t dest: " << input_data[in].dest << "\t data: " << input_data[in].data << endl);
373  }
374  if (LenOutputBuffer > 0) {
375 #pragma hls_unroll yes
376  for (unsigned out = 0; out < NumOutputs; out++) {
377  output_ready[out] = !isOutputFull(out);
378  }
379  } else {
380 #pragma hls_unroll yes
381  for (unsigned out = 0; out < NumOutputs; out++) {
382  output_ready[out] = true;
383  }
384  }
385 
386  // Process the XBAR and arbiters
387  xbar(input_data, input_valid, input_consumed, output_data, output_valid,
388  output_ready, source);
389  for (unsigned out = 0; out < NumOutputs; out++) {
390  //DCOUT("DUT - Output: " << out << "\t valid: " << output_valid[out] << "\t data: " << output_data[out] << "\tReady: " << output_ready[out] << endl);
391  }
392 
393  if (LenInputBuffer > 0) {
394 // Increment the head pointer of the input queues if their requests were granted
395 // Do this in a single loop over input lanes so it's clear to the tool that each
396 // input channel
397 // is only popped once
398 #pragma hls_unroll yes
399  for (unsigned in = 0; in < NumInputs; in++) {
400  if (input_consumed[in]) {
401  input_queues.incrHead(in);
402  }
403  }
404  } else {
405 #pragma hls_unroll yes
406  for (unsigned in = 0; in < NumInputs; in++) {
407  ready[in] = input_consumed[in];
408  }
409  }
410 
411  if (LenOutputBuffer > 0) {
412 // Read from each output channel if it is not empty
413 #pragma hls_unroll yes
414  for (unsigned out = 0; out < NumOutputs; out++) {
415  if (output_valid[out]) {
416  output_queues.push(output_data[out], out);
417  }
418  valid_out[out] = !isOutputEmpty(out);
419  if (!isOutputEmpty(out)) {
420  data_out[out] = peek(out);
421  }
422  /*peek only
423  if (!isOutputEmpty(out)) {
424  data_out[out] = pop(out);
425  }
426  */
427  }
428  for (unsigned out = 0; out < NumOutputs; out++) {
429  //DCOUT("DUT - ArbXbar Output: " << out << "\t valid: " << valid_out[out] << "\t data: " << data_out[out] << endl);
430  }
431  } else {
432 #pragma hls_unroll yes
433  for (unsigned out = 0; out < NumOutputs; out++) {
434  data_out[out] = output_data[out];
435  valid_out[out] = output_valid[out];
436  }
437  }
438  } // end run() function
444  void run(DataType data_in[NumInputs], OutputIdx dest_in[NumInputs],
445  bool valid_in[NumInputs], DataType data_out[NumOutputs],
446  bool valid_out[NumOutputs], bool ready[NumInputs]) {
447  InputIdx source[NumOutputs];
448  run(data_in, dest_in,
449  valid_in, data_out,
450  valid_out, ready, source);
451  } // end run() function
452 
453 
454 }; // end ArbitratedCrossbar class
455 
456 #endif // end ARBITRATED_CROSSBAR_H
Compute index width of a constant.
Definition: nvhls_int.h:285
void run(DataType data_in[NumInputs], OutputIdx dest_in[NumInputs], bool valid_in[NumInputs], DataType data_out[NumOutputs], bool valid_out[NumOutputs], bool ready[NumInputs], InputIdx source[NumOutputs])
Top-Level function for Arbitrated Crossbar.
#define NVHLS_ASSERT_MSG(X, MSG)
Definition: nvhls_assert.h:116
Crossbar with conflict arbitration and input queuing.
#define NVUINTW(width)
Definition: nvhls_types.h:35
void run(DataType data_in[NumInputs], OutputIdx dest_in[NumInputs], bool valid_in[NumInputs], DataType data_out[NumOutputs], bool valid_out[NumOutputs], bool ready[NumInputs])
Top-Level function for Arbitrated Crossbar that does not return source.