# Source code for sionna.fec.polar.decoding

#
# SPDX-FileCopyrightText: Copyright (c) 2021-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
"""Layers for Polar decoding such as successive cancellation (SC), successive
cancellation list (SCL) and iterative belief propagation (BP) decoding."""

import tensorflow as tf
import numpy as np
from numpy.core.numerictypes import issubdtype
import warnings
from tensorflow.keras.layers import Layer
from sionna.fec.crc import CRCDecoder, CRCEncoder
from sionna.fec.polar.encoding import Polar5GEncoder
import numbers

class PolarSCDecoder(Layer):
    """PolarSCDecoder(frozen_pos, n, output_dtype=tf.float32, **kwargs)

    Successive cancellation (SC) decoder [Arikan_Polar]_ for Polar codes and
    Polar-like codes.

    The class inherits from the Keras layer class and can be used as layer in a
    Keras model.

    Parameters
    ----------
        frozen_pos: ndarray
            Array of `int` defining the ``n-k`` indices of the frozen positions.
            Any integer dtype is accepted.

        n: int
            Defining the codeword length.

        output_dtype: tf.DType
            Defaults to tf.float32. Defines the output datatype of the layer
            (internal precision remains tf.float32).

    Input
    -----
        inputs: [...,n], tf.float32
            2+D tensor containing the channel LLR values (as logits). The
            dtype is checked against the layer's ``dtype``, which is
            initialized from ``output_dtype``.

    Output
    ------
        : [...,k], tf.float32
            2+D tensor containing hard-decided estimations of all ``k``
            information bits (cast to ``output_dtype``).

    Raises
    ------
        AssertionError
            If ``n`` is not a number.

        AssertionError
            If ``n`` is not a (positive) power of 2.

        AssertionError
            If the number of elements in ``frozen_pos`` is greater than ``n``.

        AssertionError
            If ``frozen_pos`` does not consist of `int`.

        ValueError
            If ``output_dtype`` is not {tf.float16, tf.float32, tf.float64}.

    Note
    ----
        This layer implements the SC decoder as described in
        [Arikan_Polar]_. However, the implementation follows the `recursive
        tree` [Gross_Fast_SCL]_ terminology and combines nodes for increased
        throughputs without changing the outcome of the algorithm.

        As commonly done, we assume frozen bits are set to `0`. Please note
        that - although its practical relevance is only little - setting frozen
        bits to `1` may result in `affine` codes instead of linear code as the
        `all-zero` codeword is not necessarily part of the code any more.

    """

    def __init__(self, frozen_pos, n, output_dtype=tf.float32, **kwargs):

        if output_dtype not in (tf.float16, tf.float32, tf.float64):
            raise ValueError(
                'output_dtype must be {tf.float16, tf.float32, tf.float64}.')

        if output_dtype is not tf.float32:
            print('Note: decoder uses tf.float32 for internal calculations.')

        super().__init__(dtype=output_dtype, **kwargs)
        self._output_dtype = output_dtype

        # sanity checks of the code parameters
        assert isinstance(n, numbers.Number), "n must be a number."
        n = int(n) # n can be float (e.g. as result of n=k*r)

        # np.integer matches every integer dtype (e.g., int32 or uint8) and
        # not only the platform default that the builtin ``int`` maps to
        assert np.issubdtype(frozen_pos.dtype, np.integer), \
            "frozen_pos contains non int."
        assert len(frozen_pos)<=n, "Num. of elements in frozen_pos cannot " \
            "be greater than n."
        # n>0 guards log2 against 0 / negative values which would otherwise
        # surface as OverflowError / ValueError from int()
        assert n>0 and np.log2(n)==int(np.log2(n)), "n must be a power of 2."

        # store internal attributes
        self._n = n
        self._frozen_pos = frozen_pos
        self._k = self._n - len(self._frozen_pos)
        self._info_pos = np.setdiff1d(np.arange(self._n), self._frozen_pos)
        # fails, e.g., if frozen_pos contains duplicate indices
        assert self._k==len(self._info_pos), "Internal error: invalid " \
                                              "info_pos generated."
        self._llr_max = 30. # internal max LLR value (uncritical for SC dec)
        # indicator vector of length n: 1 at frozen positions, 0 otherwise
        self._frozen_ind = np.zeros(self._n)
        self._frozen_ind[self._frozen_pos] = 1

        # graph pruning of all-frozen (rate-0) sub-trees; disabled by default
        self._use_fast_sc = False

    #########################################
    # Public methods and properties
    #########################################

    @property
    def n(self):
        """Codeword length."""
        return self._n

    @property
    def k(self):
        """Number of information bits."""
        return self._k

    @property
    def frozen_pos(self):
        """Frozen positions for Polar decoding."""
        return self._frozen_pos

    @property
    def info_pos(self):
        """Information bit positions, i.e., all non-frozen positions."""
        return self._info_pos

    @property
    def llr_max(self):
        """Maximum LLR value for internal calculations."""
        return self._llr_max

    @property
    def output_dtype(self):
        """Output dtype of decoder."""
        return self._output_dtype

    #########################
    # Utility methods
    #########################

    def _cn_op_tf(self, x, y):
        """Check-node update (boxplus) for LLR inputs.

        Operations are performed element-wise. Both inputs are clipped to
        ``[-llr_max, llr_max]`` before the update.

        See [Stimming_LLR]_ and [Hashemi_SSCL]_ for detailed equations.
        """
        x_in = tf.clip_by_value(x,
                                clip_value_min=-self._llr_max,
                                clip_value_max=self._llr_max)
        y_in = tf.clip_by_value(y,
                                clip_value_min=-self._llr_max,
                                clip_value_max=self._llr_max)

        # avoid division for numerical stability
        # log(1+e^(x+y)) via softplus
        llr_out = tf.math.softplus(x_in + y_in)
        # log(e^x+e^y) via logsumexp
        llr_out -= tf.math.reduce_logsumexp(tf.stack([x_in, y_in], axis=-1),
                                            axis=-1)

        return llr_out

    def _vn_op_tf(self, x, y, u_hat):
        """Variable-node update for LLR inputs.

        The sign of ``x`` is flipped wherever ``u_hat`` equals 1 before it is
        added to ``y``. Operations are performed element-wise.
        """
        return tf.multiply((1-2*u_hat), x) + y

    def _polar_decode_sc_tf(self, llr_ch, frozen_ind):
        """Recursive SC decoding function.

        Recursively branch decoding tree and split into decoding of `upper`
        and `lower` path until reaching a leaf node.

        The function returns the u_hat decisions at stage `0` and the bit
        decisions of the intermediate stage `s` (i.e., the re-encoded version of
        `u_hat` until the current stage `s`).

        Args:
            llr_ch: Tensor whose last dimension has the same length as
                ``frozen_ind``; holds the LLRs of the current sub-code.
            frozen_ind: 1D array with `1` at frozen positions of the current
                sub-code.

        Returns:
            Tuple ``(u_hat, u_hat_up)`` as described above.

        Note:
            This decoder parallelizes over the batch-dimension, i.e., the tree
            is processed for all samples in the batch in parallel. This yields a
            higher throughput, but does not improve the latency.
        """

        # calculate current codeword length
        n = len(frozen_ind)

        # branch if leaf is not reached yet
        if n>1:
            # rate-0 node: all positions frozen, nothing to decode
            if self._use_fast_sc and np.sum(frozen_ind)==n:
                u_hat = tf.zeros_like(llr_ch)
                return u_hat, u_hat

            n_half = n // 2
            llr_ch1 = llr_ch[...,0:n_half]
            llr_ch2 = llr_ch[...,n_half:]
            frozen_ind1 = frozen_ind[0:n_half]
            frozen_ind2 = frozen_ind[n_half:]

            # upper path
            x_llr1_in = self._cn_op_tf(llr_ch1, llr_ch2)

            # and call the decoding function (with upper half)
            u_hat1, u_hat1_up = self._polar_decode_sc_tf(x_llr1_in, frozen_ind1)

            # lower path (requires the re-encoded decisions of the upper path)
            x_llr2_in = self._vn_op_tf(llr_ch1, llr_ch2, u_hat1_up)
            # and call the decoding function again (with lower half)
            u_hat2, u_hat2_up = self._polar_decode_sc_tf(x_llr2_in, frozen_ind2)

            # combine u_hat from both branches
            u_hat = tf.concat([u_hat1, u_hat2], -1)

            # calculate re-encoded version of u_hat at current stage;
            # the mod-2 sum is realized via bitwise_xor (more efficient)
            u_hat1_up_int = tf.cast(u_hat1_up, tf.int8)
            u_hat2_up_int = tf.cast(u_hat2_up, tf.int8)
            u_hat1_up_int = tf.bitwise.bitwise_xor(u_hat1_up_int,
                                                   u_hat2_up_int)
            u_hat1_up = tf.cast(u_hat1_up_int , tf.float32)
            u_hat_up = tf.concat([u_hat1_up, u_hat2_up], -1)

        else: # if leaf is reached perform basic decoding op (=decision)

            if frozen_ind[0]==1: # position is frozen
                # at a leaf llr_ch has a last dimension of length 1
                u_hat = tf.zeros_like(llr_ch)
                u_hat_up = u_hat
            else: # otherwise hard decide
                u_hat = 0.5 * (1. - tf.sign(llr_ch))
                # remove "exact 0 llrs" leading to u_hat=0.5
                u_hat = tf.where(tf.equal(u_hat, 0.5),
                                 tf.ones_like(u_hat),
                                 u_hat)
                u_hat_up = u_hat
        return u_hat, u_hat_up

    #########################
    # Keras layer functions
    #########################

    def build(self, input_shape):
        """Check if shape of input is invalid.

        Raises:
            AssertionError: If the last dimension is not ``n`` or the input
                has less than two dimensions.
        """
        assert (input_shape[-1]==self._n), "Invalid input shape."
        assert (len(input_shape)>=2), 'Inputs must have at least 2 dimensions.'

    def call(self, inputs):
        """Successive cancellation (SC) decoding function.

        Performs successive cancellation decoding and returns the estimated
        information bits.

        Args:
            inputs (tf.float32): Tensor of shape `[...,n]` containing the
                channel LLR values (as logits).

        Returns:
            `tf.float32`: Tensor of shape `[...,k]` containing
            hard-decided estimations of all ``k`` information bits.

        Raises:
            ValueError: If ``inputs`` is not of shape `[..., n]`
                or `dtype` is not `tf.float32`.

            InvalidArgumentError: When rank(``inputs``)<2.

        Note:
            This function recursively unrolls the SC decoding tree, thus,
            for larger values of ``n`` building the decoding graph can become
            time consuming.
        """

        tf.debugging.assert_type(inputs, self.dtype, 'Invalid input dtype.')
        # internal calculations still in tf.float32
        inputs = tf.cast(inputs, tf.float32)

        # last dim must be of length n
        tf.debugging.assert_equal(tf.shape(inputs)[-1],
                                  self._n,
                                  "Last input dimension must be of length n.")

        # Reshape inputs to [-1, n]
        tf.debugging.assert_greater(tf.rank(inputs), 1)
        input_shape = inputs.shape
        new_shape = [-1, self._n]
        llr_ch = tf.reshape(inputs, new_shape)

        llr_ch = -1. * llr_ch # logits are converted into "true" llrs

        # and decode
        u_hat_n, _ = self._polar_decode_sc_tf(llr_ch, self._frozen_ind)

        # and recover the k information bit positions
        u_hat = tf.gather(u_hat_n, self._info_pos, axis=1)

        # and reconstruct input shape
        output_shape = input_shape.as_list()
        output_shape[-1] = self.k
        output_shape[0] = -1 # first dim can be dynamic (None)
        u_hat_reshape = tf.reshape(u_hat, output_shape)
        return tf.cast(u_hat_reshape, self._output_dtype)

class PolarSCLDecoder(Layer):
    # pylint: disable=line-too-long
    """PolarSCLDecoder(frozen_pos, n, list_size=8, crc_degree=None, use_hybrid_sc=False, use_fast_scl=True, cpu_only=False, use_scatter=False, ind_iil_inv=None, return_crc_status=False, output_dtype=tf.float32, **kwargs)

    Successive cancellation list (SCL) decoder [Tal_SCL]_ for Polar codes
    and Polar-like codes.

    The class inherits from the Keras layer class and can be used as layer in a
    Keras model.

    Parameters
    ----------
        frozen_pos: ndarray
            Array of `int` defining the ``n-k`` indices of the frozen positions.

        n: int
            Defining the codeword length.

        list_size: int
            Defaults to 8. Defines the list size of the decoder.

        crc_degree: str
            Defining the CRC polynomial to be used. Can be any value from
            `{CRC24A, CRC24B, CRC24C, CRC16, CRC11, CRC6}`.

        use_hybrid_sc: bool
            Defaults to False. If True, SC decoding is applied and only the
            codewords with invalid CRC are decoded with SCL. This option
            requires an outer CRC specified via ``crc_degree``.
            Remark: hybrid_sc does not support XLA optimization, i.e.,
            `@tf.function(jit_compile=True)`.

        use_fast_scl: bool
            Defaults to True. If True, Tree pruning is used to
            reduce the decoding complexity. The output is equivalent to the
            non-pruned version (besides numerical differences).

        cpu_only: bool
            Defaults to False. If True, `tf.py_function` embedding
            is used and the decoder runs on the CPU. This option is usually
            slower, but also more memory efficient and, in particular,
            recommended for larger blocklengths. Remark: cpu_only does not
            support XLA optimization `@tf.function(jit_compile=True)`.

        use_scatter: bool
            Defaults to False. If True, `tf.tensor_scatter_update` is used for
            tensor updates. This option is usually slower, but more memory
            efficient.

        ind_iil_inv : None or [k+k_crc], int or tf.int
            Defaults to None. If not `None`, the sequence is used as inverse
            input bit interleaver before evaluating the CRC.
            Remark: this only affects the CRC evaluation; the output
            sequence is not permuted.

        return_crc_status: bool
            Defaults to False. If True, the decoder additionally returns the
            CRC status indicating if a codeword was (most likely) correctly
            recovered. This is only available if ``crc_degree`` is not None.

        output_dtype: tf.DType
            Defaults to tf.float32. Defines the output datatype of the layer
            (internal precision remains tf.float32).

    Input
    -----
        inputs: [...,n], tf.float32
            2+D tensor containing the channel LLR values (as logits).

    Output
    ------
        b_hat : [...,k], tf.float32
            2+D tensor containing hard-decided estimations of all `k`
            information bits.

        crc_status : [...], tf.bool
            CRC status indicating if a codeword was (most likely) correctly
            recovered. This is only returned if ``return_crc_status`` is True.
            Note that false positives are possible.

    Raises
    ------
        AssertionError
            If ``n`` is not `int`.

        AssertionError
            If ``n`` is not a power of 2.

        AssertionError
            If the number of elements in ``frozen_pos`` is greater than ``n``.

        AssertionError
            If ``frozen_pos`` does not consist of `int`.

        AssertionError
            If ``list_size`` is not `int`.

        AssertionError
            If ``cpu_only`` is not `bool`.

        AssertionError
            If ``use_scatter`` is not `bool`.

        AssertionError
            If ``use_fast_scl`` is not `bool`.

        AssertionError
            If ``use_hybrid_sc`` is not `bool`.

        AssertionError
            If ``list_size`` is not a power of 2.

        ValueError
            If ``output_dtype`` is not {tf.float16, tf.float32,
            tf.float64}.

        ValueError
            If ``inputs`` is not of shape `[..., n]` or `dtype` is not
            correct.

        InvalidArgumentError
            When rank(``inputs``)<2.

    Note
    ----
        This layer implements the successive cancellation list (SCL) decoder
        as described in [Tal_SCL]_ but uses LLR-based message updates
        [Stimming_LLR]_. The implementation follows the notation from
        [Gross_Fast_SCL]_, [Hashemi_SSCL]_. If option `use_fast_scl` is active
        tree pruning is used and tree nodes are combined if possible (see
        [Hashemi_SSCL]_ for details).

        Implementing SCL decoding as TensorFlow graph is a difficult task that
        requires several design tradeoffs to match the TF constraints while
        maintaining a reasonable throughput. Thus, the decoder minimizes
        the `control flow` as much as possible, leading to a strong memory
        occupation (e.g., due to full path duplication after each decision).
        For longer code lengths, the complexity of the decoding graph becomes
        large and we recommend to use the ``cpu_only`` option that uses an
        embedded Numpy decoder. Further, this function recursively unrolls the
        SCL decoding tree, thus, for larger values of ``n`` building the
        decoding graph can become time consuming. Please consider the
        ``cpu_only`` option if building the graph takes too long.

        A hybrid SC/SCL decoder as proposed in [Cammerer_Hybrid_SCL]_ (using SC
        instead of BP) can be activated with option ``use_hybrid_sc`` iff an
        outer CRC is available. Please note that the results are not exactly
        SCL performance caused by the false positive rate of the CRC.

        As commonly done, we assume frozen bits are set to `0`. Please note
        that - although its practical relevance is only little - setting frozen
        bits to `1` may result in `affine` codes instead of linear code as the
        `all-zero` codeword is not necessarily part of the code any more.
    """

    def __init__(self,
                 frozen_pos,
                 n,
                 list_size=8,
                 crc_degree=None,
                 use_hybrid_sc=False,
                 use_fast_scl=True,
                 cpu_only=False,
                 use_scatter=False,
                 ind_iil_inv=None,
                 return_crc_status=False,
                 output_dtype=tf.float32,
                 **kwargs):
        """Validate the configuration and precompute the static decoder state.

        The meaning of all parameters is given in the class docstring.

        Raises:
            ValueError: If ``output_dtype`` is not supported, if
                ``return_crc_status`` is requested without ``crc_degree`` or
                if ``use_hybrid_sc`` is requested without ``crc_degree``.

            AssertionError: If any other parameter has an invalid type or
                value.
        """

        if output_dtype not in (tf.float16, tf.float32, tf.float64):
            raise ValueError(
                'output_dtype must be {tf.float16, tf.float32, tf.float64}.')

        if output_dtype is not tf.float32:
            print('Note: decoder uses tf.float32 for internal calculations.')

        super().__init__(dtype=output_dtype, **kwargs)
        self._output_dtype = output_dtype

        # type checks of all parameters
        assert isinstance(n, numbers.Number), "n must be a number."
        n = int(n) # n can be float (e.g. as result of n=k*r)
        assert isinstance(list_size, int), "list_size must be integer."
        assert isinstance(cpu_only, bool), "cpu_only must be bool."
        assert isinstance(use_scatter, bool), "use_scatter must be bool."
        assert isinstance(use_fast_scl, bool), "use_fast_scl must be bool."
        assert isinstance(use_hybrid_sc, bool), "use_hybrid_sc must be bool."
        assert isinstance(return_crc_status, bool), \
                                            "return_crc_status must be bool."

        # np.integer matches every integer dtype (e.g., int32 or uint8) and
        # not only the platform default that the builtin ``int`` maps to
        assert np.issubdtype(frozen_pos.dtype, np.integer), \
            "frozen_pos contains non int."
        assert len(frozen_pos)<=n, "Num. of elements in frozen_pos cannot " \
            "be greater than n."
        # the >0 guards keep log2 away from 0 / negative values which would
        # otherwise surface as OverflowError / ValueError from int()
        assert n>0 and np.log2(n)==int(np.log2(n)), "n must be a power of 2."
        assert list_size>0 and np.log2(list_size)==int(np.log2(list_size)), \
                                    "list_size must be a power of 2."

        # CPU mode is recommended for larger values of n
        if n>128 and cpu_only is False and use_hybrid_sc is False:
            warnings.warn("Required resource allocation is large " \
            "for the selected blocklength. Consider option `cpu_only=True`.")

        # CPU mode is recommended for larger values of L
        if list_size>32 and cpu_only is False and use_hybrid_sc is False:
            warnings.warn("Resource allocation is high for the " \
            "selected list_size. Consider option `cpu_only=True`.")

        # internal decoder parameters
        self._use_fast_scl = use_fast_scl # optimize rate-0 and rep nodes
        self._use_scatter = use_scatter # slower but more memory friendly
        self._cpu_only = cpu_only # run numpy decoder
        self._use_hybrid_sc = use_hybrid_sc

        # store internal attributes
        self._n = n
        self._frozen_pos = frozen_pos
        self._k = self._n - len(self._frozen_pos)
        self._list_size = list_size
        self._info_pos = np.setdiff1d(np.arange(self._n), self._frozen_pos)
        self._llr_max = 30. # internal max LLR value (not very critical for SC)
        # fails, e.g., if frozen_pos contains duplicate indices
        assert self._k==len(self._info_pos), "Internal error: invalid " \
                                             "info_pos generated."
        # indicator vector of length n: 1 at frozen positions, 0 otherwise
        self._frozen_ind = np.zeros(self._n)
        self._frozen_ind[self._frozen_pos] = 1
        self._cw_ind = np.arange(self._n)
        self._n_stages = int(np.log2(self._n)) # number of decoding stages

        # init CRC check (if needed)
        if crc_degree is not None:
            self._use_crc = True
            self._crc_decoder = CRCDecoder(CRCEncoder(crc_degree))
            self._k_crc = self._crc_decoder.encoder.crc_length
        else:
            self._use_crc = False
            self._k_crc = 0
        # message is built by implicit concatenation; a backslash continuation
        # inside the literal would embed the source indentation in the text
        assert self._k>=self._k_crc, "Value of k is too small for " \
            "given CRC_degree."

        # the CRC status can only be reported if a CRC is evaluated
        if (crc_degree is None) and return_crc_status:
            raise ValueError("Returning CRC status requires given crc_degree.")
        self._return_crc_status = return_crc_status

        # store the inverse interleaver pattern
        if ind_iil_inv is not None:
            # NOTE(review): the check uses self._k while the message refers to
            # k+k_crc; presumably self._k already counts the CRC bits - confirm
            assert (ind_iil_inv.shape[0]==self._k), \
                    "ind_int must be of length k+k_crc."
            self._ind_iil_inv = ind_iil_inv
            self._iil = True
        else:
            self._iil = False

        # use SC decoder first and use numpy-based SCL as "afterburner"
        if self._use_hybrid_sc:
            # Note: CRC required to detect SC success; verify this before
            # the SC decoder is instantiated
            if not self._use_crc:
                raise ValueError("Hybrid SC requires outer CRC.")
            self._decoder_sc = PolarSCDecoder(frozen_pos, n)

    #########################################
    # Public methods and properties
    #########################################

    @property
    def n(self):
        """Codeword length ``n`` as given at initialization (cast to `int`)."""
        return self._n

    @property
    def k(self):
        """Number of information bits, i.e., ``n`` minus the number of
        frozen positions."""
        return self._k

    @property
    def k_crc(self):
        """Number of CRC bits (`0` if no ``crc_degree`` was given)."""
        return self._k_crc

    @property
    def frozen_pos(self):
        """Frozen positions for Polar decoding (as given at initialization)."""
        return self._frozen_pos

    @property
    def info_pos(self):
        """Information bit positions, i.e., all indices in ``[0, n)`` that
        are not part of ``frozen_pos``."""
        return self._info_pos

    @property
    def llr_max(self):
        """Maximum LLR magnitude used for clipping in internal calculations."""
        return self._llr_max

    @property
    def list_size(self):
        """List size for SCL decoding (as given at initialization)."""
        return self._list_size

    @property
    def output_dtype(self):
        """Output dtype of decoder (as given at initialization)."""
        return self._output_dtype

    #####################################
    # Helper functions for the TF decoder
    #####################################

    def _update_rate0_code(self, msg_pm, msg_uhat, msg_llr, cw_ind):
        """Process an all-frozen (rate-0) sub-code located at ``cw_ind``.

        Only the path metrics change: for every path the penalty
        ``softplus(-llr)`` of all (clipped) LLRs of the sub-block is added,
        see eq. (26) in [Hashemi_SSCL]_.

        Remark: ``msg_uhat`` and ``msg_llr`` are returned untouched; the bits
        are not explicitly set to `0` as ``msg_uhat`` is initialized with `0`
        already.
        """
        # the sub-block length determines the stage the LLRs are read from
        stage = int(np.log2(len(cw_ind)))

        block_llr = tf.clip_by_value(
                        tf.gather(msg_llr[:, :, stage, :], cw_ind, axis=2),
                        clip_value_min=-self._llr_max,
                        clip_value_max=self._llr_max)

        # accumulate the penalty over the complete sub-block
        penalty = tf.reduce_sum(tf.math.softplus(-1.*block_llr), axis=-1)
        msg_pm += penalty

        return msg_pm, msg_uhat, msg_llr

    def _update_rep_code(self, msg_pm, msg_uhat, msg_llr, cw_ind):
        """Process a repetition sub-code (i.e., only the rightmost bit is
        non-frozen) located at ``cw_ind``.

        The first ``list_size`` paths keep the all-zero hypothesis while the
        remaining paths receive the all-one hypothesis at the stage of the
        sub-block. Afterwards the last bit is branched, the paths are sorted
        and duplicated.

        See Eq. (31) in [Hashemi_SSCL]_.

        Remark: bits are not explicitly set to `0` as ``msg_uhat`` is
        initialized with `0` already.
        """
        block_len = len(cw_ind)
        stage = int(np.log2(block_len))
        num_paths = self._list_size

        # --- path metric update ---
        block_llr = tf.clip_by_value(
                        tf.gather(msg_llr[:, :, stage, :], cw_ind, axis=2),
                        clip_value_min=-self._llr_max,
                        clip_value_max=self._llr_max)

        # paths of the upper branch assume bit 1, i.e., the LLR sign flips
        signed_llr = tf.concat([block_llr[:, :num_paths, :],
                                -block_llr[:, num_paths:, :]], 1)
        msg_pm += tf.reduce_sum(tf.math.softplus(-1.*signed_llr), axis=-1)

        # --- bit update: write ones into the sub-block of the upper branch ---
        kept_paths = msg_uhat[:, :num_paths, :, :]

        row_before = tf.expand_dims(
                        msg_uhat[:, num_paths:, stage, :cw_ind[0]],
                        axis=2)
        row_after = tf.expand_dims(
                        msg_uhat[:, num_paths:, stage, cw_ind[-1]+1:],
                        axis=2)
        row_ones = tf.ones([tf.shape(msg_uhat)[0], num_paths, 1, block_len],
                           tf.float32)
        new_row = tf.concat([row_before, row_ones, row_after], 3)

        # all other stages of the upper branch are kept as they are
        lower_stages = msg_uhat[:, num_paths:, :stage, :]
        higher_stages = msg_uhat[:, num_paths:, stage+1:, :]
        flipped_paths = tf.concat([lower_stages, new_row, higher_stages], 2)

        msg_uhat = tf.concat([kept_paths, flipped_paths], 1)

        # branch the last bit at pos cw_ind[-1], then sort and duplicate
        msg_uhat = self._update_single_bit([cw_ind[-1]], msg_uhat)
        msg_pm, msg_uhat, msg_llr = self._sort_decoders(msg_pm,
                                                        msg_uhat,
                                                        msg_llr)
        msg_uhat, msg_llr, msg_pm = self._duplicate_paths(msg_uhat,
                                                          msg_llr,
                                                          msg_pm)
        return msg_pm, msg_uhat, msg_llr

    def _update_single_bit(self, ind_u, msg_uhat):
        """Set the bit at position ``ind_u`` to `1` for the second half of
        the paths (indices ``list_size`` and above) at stage `0`.

        Nothing happens if the position is frozen.

        Remark: bits are not explicitly set to `0` as ``msg_uhat`` is
        initialized with `0` already.

        Remark: Two versions are implemented (throughput vs. graph complexity):
        1.) use tensor_scatter_nd_update
        2.) explicitly split graph and concatenate again
        """
        bit_pos = ind_u[0]
        num_paths = self._list_size

        # frozen positions keep their value
        if self._frozen_ind[bit_pos]!=0:
            return msg_uhat

        if self._use_scatter:
            # equivalent to: msg_uhat[:, ind_up, 0, ind_u] = 1
            path_idx = np.arange(num_paths, 2*num_paths, 1)
            stage_idx = np.array([0])

            # transpose such that batch dim can be broadcasted
            uhat_t = tf.transpose(msg_uhat, [1, 3, 2, 0])

            # generate index grid
            bit_idx = tf.cast(ind_u, tf.int64)
            grid = tf.meshgrid(path_idx, bit_idx, stage_idx)
            scatter_idx = tf.reshape(tf.stack(grid, axis=-1), [-1, 3])

            ones = tf.ones([scatter_idx.shape[0], tf.shape(msg_uhat)[0]])
            uhat_t = tf.tensor_scatter_nd_update(uhat_t, scatter_idx, ones)

            # and restore original order
            return tf.transpose(uhat_t, [3, 0, 2, 1])

        # alternative solution with split/concatenation of graph
        kept_paths = msg_uhat[:, :num_paths, :, :]

        row_before = tf.expand_dims(
                        msg_uhat[:, num_paths:, 0, :bit_pos],
                        axis=2)
        row_after = tf.expand_dims(
                        msg_uhat[:, num_paths:, 0, bit_pos+1:],
                        axis=2)
        # ones to insert
        row_one = tf.ones_like(tf.reshape(
                        msg_uhat[:, num_paths:, 0, bit_pos],
                        [-1, num_paths, 1, 1]))

        stage0 = tf.concat([row_before, row_one, row_after], 3)
        other_stages = msg_uhat[:, num_paths:, 1:, :]

        flipped_paths = tf.concat([stage0, other_stages], 2)
        return tf.concat([kept_paths, flipped_paths], 1)

    def _update_pm(self, ind_u, msg_uhat, msg_llr, msg_pm):
        """Add the penalty of the decision at bit position ``ind_u`` to the
        path metric of every decoder.

        Implements (10) from [Stimming_LLR]_; the LLR is clipped to
        ``[-llr_max, llr_max]`` before it is used.
        """
        bit_pos = ind_u[0]
        decision = msg_uhat[:, :, 0, bit_pos]

        llr_clipped = tf.clip_by_value(msg_llr[:, :, 0, bit_pos],
                                       clip_value_min=-self._llr_max,
                                       clip_value_max=self._llr_max)

        # softplus is the numerically more stable form of log(1 + exp(-x))
        signed_llr = tf.multiply((1 - 2*decision), llr_clipped)
        msg_pm += tf.math.softplus(-signed_llr)
        return msg_pm

    def _sort_decoders(self, msg_pm, msg_uhat, msg_llr):
        """Reorder all decoder states along the path dimension such that the
        path metrics are ascending.

        Returns the tuple ``(msg_pm, msg_uhat, msg_llr)`` in the new order.
        """
        order = tf.argsort(msg_pm, axis=-1)

        # the same per-batch permutation is applied to every state tensor
        return tuple(tf.gather(state, order, batch_dims=1, axis=None)
                     for state in (msg_pm, msg_uhat, msg_llr))

    def _cn_op(self, x, y):
        """Element-wise check-node update (boxplus) of two LLR tensors.

        Both inputs are clipped to ``[-llr_max, llr_max]`` first.

        See [Stimming_LLR]_ and [Hashemi_SSCL]_ for detailed equations.
        """
        bound = self._llr_max
        x_clip = tf.clip_by_value(x,
                                  clip_value_min=-bound,
                                  clip_value_max=bound)
        y_clip = tf.clip_by_value(y,
                                  clip_value_min=-bound,
                                  clip_value_max=bound)

        # the quotient is evaluated as difference of logs (no division)
        # log(1+e^(x+y))
        log_num = tf.math.softplus((x_clip + y_clip))
        # log(e^x+e^y)
        log_den = tf.math.reduce_logsumexp(tf.stack([x_clip, y_clip], axis=-1),
                                           axis=-1)

        return log_num - log_den

    def _vn_op(self, x, y, u_hat):
        """Element-wise variable-node update for LLR inputs.

        The sign of ``x`` is flipped wherever ``u_hat`` equals `1` before it
        is added to ``y``.

        See [Stimming_LLR]_ and [Hashemi_SSCL]_ for detailed equations.
        """
        sign = (1 - 2*u_hat)
        return tf.multiply(sign, x) + y

    def _duplicate_paths(self, msg_uhat, msg_llr, msg_pm):
        """Keep the first ``list_size`` paths and repeat them once along the
        path dimension, i.e., the upper branch is copied into the lower one.

        Returns the tuple ``(msg_uhat, msg_llr, msg_pm)``.
        """
        num_paths = self._list_size
        twice_4d = [1, 2, 1, 1] # repeat along the path dimension only

        uhat_dup = tf.tile(msg_uhat[:, :num_paths, :, :], twice_4d)
        llr_dup = tf.tile(msg_llr[:, :num_paths, :, :], twice_4d)
        pm_dup = tf.tile(msg_pm[:, :num_paths], [1, 2])

        return uhat_dup, llr_dup, pm_dup

    def _update_left_branch(self, msg_llr, stage_ind, cw_ind_left,
                            cw_ind_right):
        """Write the check-node outputs of the left branch into ``msg_llr``.

        The LLRs of stage ``stage_ind`` at positions ``cw_ind_left`` and
        ``cw_ind_right`` are combined via the check-node update and the result
        is stored in stage ``stage_ind-1`` at positions ``cw_ind_left``.

        Remark: Two versions are implemented (throughput vs. graph complexity):
        1.) use tensor_scatter_nd_update
        2.) explicitly split graph and concatenate again
        """
        # inputs of the check-node update come from the current stage
        llr_stage = msg_llr[:, :, stage_ind, :]
        cn_in_left = tf.gather(llr_stage, cw_ind_left, axis=2)
        cn_in_right = tf.gather(llr_stage, cw_ind_right, axis=2)
        cn_out = self._cn_op(cn_in_left, cn_in_right)

        if not self._use_scatter:
            # Version 2: rebuild row (stage_ind-1) from its unchanged parts
            # and the new values, then re-assemble the stage axis.
            llr_prev = msg_llr[:, :, stage_ind-1, :]
            head = tf.gather(llr_prev,
                             np.arange(0, cw_ind_left[0]),
                             axis=2)
            unchanged_right = tf.gather(llr_prev, cw_ind_right, axis=2)
            tail = tf.gather(llr_prev,
                             np.arange(cw_ind_right[-1] +1, self._n),
                             axis=2)

            new_row = tf.concat([head, cn_out, unchanged_right, tail], 2)
            new_row = tf.expand_dims(new_row, axis=2)

            stages_below = msg_llr[:, :, 0:stage_ind-1, :]
            stages_above = msg_llr[:, :, stage_ind:, :]
            return tf.concat([stages_below, new_row, stages_above], 2)

        # Version 1: equivalent of
        # msg_llr[:, :, stage_ind-1, cw_ind_left] = cn_out

        # move (stage, position) to the front such that the remaining dims
        # are covered by each scattered slice
        llr_t = tf.transpose(msg_llr, [2, 3, 1, 0])
        updates = tf.transpose(cn_out, [2, 1, 0])

        # one (stage, position) index pair per updated position
        stage_idx = tf.cast(stage_ind, tf.int64)
        pos_idx = tf.cast(cw_ind_left, tf.int64)
        grid = tf.meshgrid(stage_idx-1, pos_idx)
        scatter_ind = tf.reshape(tf.stack(grid, axis=-1), [-1, 2])

        llr_t = tf.tensor_scatter_nd_update(llr_t, scatter_ind, updates)

        # undo the transpose from above
        return tf.transpose(llr_t, [3, 2, 0, 1])

    def _update_right_branch(self, msg_llr, msg_uhat, stage_ind, cw_ind_left,
                             cw_ind_right):
        """Update messages for right branch.

        The LLRs of stage ``stage_ind`` at positions ``cw_ind_left`` and
        ``cw_ind_right`` are combined via the variable-node update, using the
        bit estimates of stage ``stage_ind-1`` at positions ``cw_ind_left``.
        The result is stored in stage ``stage_ind-1`` of ``msg_llr`` at
        positions ``cw_ind_right`` and the updated ``msg_llr`` is returned.

        Remark: Two versions are implemented (throughput vs. graph complexity):
        1.) use tensor_scatter_nd_update
        2.) explicitly split graph and concatenate again
        """
        u_hat_left_up = tf.gather(msg_uhat[:, :, stage_ind-1, :],
                                  cw_ind_left,
                                  axis=2)

        llr_left_in = tf.gather(msg_llr[:, :, stage_ind, :],
                                cw_ind_left,
                                axis=2)

        llr_right = tf.gather(msg_llr[:, :, stage_ind, :],
                              cw_ind_right,
                              axis=2)

        llr_right_out = self._vn_op(llr_left_in, llr_right, u_hat_left_up)

        if self._use_scatter:
            # equivalent of
            # msg_llr[:, :, stage_ind-1, cw_ind_right] = llr_right_out

            # transpose such that batch dim can be broadcasted
            msg_llr_t = tf.transpose(msg_llr, [2, 3, 1, 0])
            llr_right_out_s = tf.transpose(llr_right_out, [2, 1, 0])

            # generate index grid
            # Both grid inputs must share the same dtype as they are stacked
            # below. The cast result was previously assigned to
            # ``cw_ind_left`` by mistake, leaving ``cw_ind_right`` uncast.
            stage_ind = tf.cast(stage_ind, tf.int64)
            cw_ind_right = tf.cast(cw_ind_right, tf.int64)
            grid = tf.meshgrid(stage_ind-1, cw_ind_right)
            ind = tf.reshape(tf.stack(grid, axis=-1), [-1, 2])

            msg_llr_s = tf.tensor_scatter_nd_update(msg_llr_t,
                                                    ind,
                                                    llr_right_out_s)

            # and restore original order
            msg_llr = tf.transpose(msg_llr_s, [3, 2, 0, 1])
        else:
            # alternative solution with split/concatenation of graph:
            # row (stage_ind-1) is rebuilt from its unchanged parts and the
            # new values for the right positions
            llr_left0 = tf.gather(msg_llr[:, :, stage_ind-1, :],
                                  np.arange(0, cw_ind_left[0]),
                                  axis=2)
            llr_left = tf.gather(msg_llr[:, :, stage_ind-1, :],
                                 cw_ind_left,
                                 axis=2)
            llr_right1 = tf.gather(msg_llr[:, :, stage_ind-1, :],
                                   np.arange(cw_ind_right[-1]+1, self._n),
                                   axis=2)

            llr_s = tf.concat([llr_left0, llr_left, llr_right_out,llr_right1],2)
            llr_s = tf.expand_dims(llr_s, axis=2)

            # re-assemble the stage axis around the rebuilt row
            msg_llr1 = msg_llr[:, :, 0:stage_ind-1, :]
            msg_llr2 = msg_llr[:, :, stage_ind:, :]

            msg_llr = tf.concat([msg_llr1, llr_s, msg_llr2], 2)

        return msg_llr

    def _update_branch_u(self, msg_uhat, stage_ind, cw_ind_left, cw_ind_right):
        """Propagate ``u_hat`` to stage ``stage_ind`` after both branches.

        The bit estimates of stage ``stage_ind-1`` are combined as
        ``[left XOR right, right]`` and stored in stage ``stage_ind`` of
        ``msg_uhat`` at positions ``cw_ind_left`` and ``cw_ind_right``.

        Remark: Two versions are implemented (throughput vs. graph complexity):
        1.) use tensor_scatter_nd_update
        2.) explicitly split graph and concatenate again
        """
        uhat_prev = msg_uhat[:, :, stage_ind-1, :]
        upper_left = tf.gather(uhat_prev, cw_ind_left, axis=2)
        upper_right = tf.gather(uhat_prev, cw_ind_right, axis=2)

        # XOR on integers (more efficient than mod2), then back to float
        xor_int = tf.bitwise.bitwise_xor(tf.cast(upper_left, tf.int32),
                                         tf.cast(upper_right, tf.int32))
        combined_left = tf.cast(xor_int, tf.float32)

        if not self._use_scatter:
            # Version 2: rebuild row stage_ind from its unchanged parts and
            # the new values, then re-assemble the stage axis.
            uhat_stage = msg_uhat[:, :, stage_ind, :]
            head = tf.gather(uhat_stage,
                             np.arange(0, cw_ind_left[0]),
                             axis=2)
            tail = tf.gather(uhat_stage,
                             np.arange(cw_ind_right[-1]+1, self._n),
                             axis=2)

            new_row = tf.concat([head, combined_left, upper_right, tail], 2)
            new_row = tf.expand_dims(new_row, axis=2)

            stages_below = msg_uhat[:, :, 0:stage_ind, :]
            stages_above = msg_uhat[:, :, stage_ind+1:, :]
            return tf.concat([stages_below, new_row, stages_above], 2)

        # Version 1: equivalent of
        # msg_uhat[:, :, stage_ind, cw_ind] = [combined_left, upper_right]
        cw_ind = np.concatenate([cw_ind_left, cw_ind_right])
        u_hat = tf.concat([combined_left, upper_right], -1)

        # move (stage, position) to the front such that the remaining dims
        # are covered by each scattered slice
        uhat_t = tf.transpose(msg_uhat, [2, 3, 1, 0])
        updates = tf.transpose(u_hat, [2, 1, 0])

        # one (stage, position) index pair per updated position
        stage_idx = tf.cast(stage_ind, tf.int64)
        pos_idx = tf.cast(cw_ind, tf.int64)
        grid = tf.meshgrid(stage_idx, pos_idx)
        scatter_ind = tf.reshape(tf.stack(grid, axis=-1), [-1, 2])

        uhat_t = tf.tensor_scatter_nd_update(uhat_t, scatter_ind, updates)

        # undo the transpose from above
        return tf.transpose(uhat_t, [3, 2, 0, 1])

    def _polar_decode_scl(self, cw_ind, msg_uhat, msg_llr, msg_pm):
        """Recursive decoding function for SCL decoding.

        The sub-code at positions ``cw_ind`` is decoded by splitting it into a
        `left` and a `right` half until a leaf node (single position) is
        reached; terminology follows [Hashemi_SSCL]_ and [Stimming_LLR]_.

        If ``use_fast_scl`` is active, rate-0 and repetition sub-codes are
        handled directly instead of being unrolled (tree pruning as proposed
        in [Hashemi_SSCL]_).

        Returns the updated ``msg_uhat``, ``msg_llr`` and ``msg_pm``.
        """
        # current sub-code length and stage index (= tree depth)
        n = len(cw_ind)
        stage_ind = int(np.log2(n))

        # ----- leaf node: take decision for a single position -----
        if n <= 1:
            msg_uhat = self._update_single_bit(cw_ind, msg_uhat)
            msg_pm = self._update_pm(cw_ind, msg_uhat, msg_llr, msg_pm)

            # information positions branch the list
            if self._frozen_ind[cw_ind]==0:
                msg_pm, msg_uhat, msg_llr = self._sort_decoders(msg_pm,
                                                                msg_uhat,
                                                                msg_llr)
                # keep the l best decoders and copy them to pos l:2*l
                msg_uhat, msg_llr, msg_pm = self._duplicate_paths(msg_uhat,
                                                                  msg_llr,
                                                                  msg_pm)
            return msg_uhat, msg_llr, msg_pm

        # ----- tree pruning for special sub-codes -----
        if self._use_fast_scl:
            frozen = self._frozen_ind[cw_ind]

            # rate-0: all positions are frozen
            if np.sum(frozen)==n:
                msg_pm, msg_uhat, msg_llr = self._update_rate0_code(
                    msg_pm, msg_uhat, msg_llr, cw_ind)
                return msg_uhat, msg_llr, msg_pm

            # repetition code: only the last position is non-frozen
            if frozen[-1]==0 and np.sum(frozen[:-1])==n-1:
                msg_pm, msg_uhat, msg_llr = self._update_rep_code(
                    msg_pm, msg_uhat, msg_llr, cw_ind)
                return msg_uhat, msg_llr, msg_pm

        # ----- inner node: split into left and right half -----
        half = int(n/2)
        cw_ind_left = cw_ind[0:half]
        cw_ind_right = cw_ind[half:]

        # left branch: update LLRs, then decode left sub-graph
        msg_llr = self._update_left_branch(msg_llr,
                                           stage_ind,
                                           cw_ind_left,
                                           cw_ind_right)
        msg_uhat, msg_llr, msg_pm = self._polar_decode_scl(cw_ind_left,
                                                           msg_uhat,
                                                           msg_llr,
                                                           msg_pm)

        # right branch: update LLRs, then decode right sub-graph
        msg_llr = self._update_right_branch(msg_llr,
                                            msg_uhat,
                                            stage_ind,
                                            cw_ind_left,
                                            cw_ind_right)
        msg_uhat, msg_llr, msg_pm = self._polar_decode_scl(cw_ind_right,
                                                           msg_uhat,
                                                           msg_llr,
                                                           msg_pm)

        # combine the estimates of both halves at the current stage
        msg_uhat = self._update_branch_u(msg_uhat,
                                         stage_ind,
                                         cw_ind_left,
                                         cw_ind_right)

        return msg_uhat, msg_llr, msg_pm

    def _decode_tf(self, llr_ch):
        """Main decoding function in TF.

        Allocates the message tensors for all ``2*list_size`` decoders,
        runs the recursive decoder and returns ``[msg_uhat, msg_pm]`` sorted
        via ``_sort_decoders``.
        """
        batch_size = tf.shape(llr_ch)[0]
        num_dec = 2*self._list_size

        # bit estimates for stages 0,...,n_stages
        msg_uhat = tf.zeros([batch_size, num_dec, self._n_stages+1, self._n])

        # LLRs of stages 0,...,n_stages-1 start at zero; the last stage holds
        # the channel LLRs (identical for all decoders)
        llr_inner = tf.zeros([batch_size, num_dec, self._n_stages, self._n])
        llr_last = tf.tile(tf.reshape(llr_ch, [-1, 1, 1, self._n]),
                           [1, num_dec, 1, 1])
        msg_llr = tf.concat([llr_inner, llr_last], 2)

        # in each half of the list only the first decoder starts without
        # penalty; the remaining L-1 decoders start with a high path metric
        pm_first = tf.zeros([batch_size, 1])
        pm_rest = self._llr_max * tf.ones([batch_size, self._list_size-1])
        msg_pm = tf.concat([pm_first, pm_rest, pm_first, pm_rest], 1)

        # unroll the recursive decoding graph
        msg_uhat, msg_llr, msg_pm = self._polar_decode_scl(self._cw_ind,
                                                           msg_uhat,
                                                           msg_llr,
                                                           msg_pm)

        # final sort of the list
        msg_pm, msg_uhat, msg_llr = self._sort_decoders(msg_pm,
                                                        msg_uhat,
                                                        msg_llr)
        return [msg_uhat, msg_pm]

    ####################################
    # Helper functions for Numpy decoder
    ####################################

    def _update_rate0_code_np(self, cw_ind):
        """Update rate-0 (i.e., all frozen) sub-code at pos ``cw_ind`` in Numpy.

        Adds ``sum(log(1+exp(-llr)))`` over all positions in ``cw_ind`` to the
        path metric of every decoder (in-place update of ``self.msg_pm``).

        See Eq. (26) in [Hashemi_SSCL]_.
        """
        stage_ind = int(np.log2(len(cw_ind)))

        # msg_pm is stored in sorted order while msg_llr is accessed via
        # pointers; gather the LLRs in the order given by the pointers
        pointer = np.expand_dims(self._dec_pointer, axis=-1)
        llr_stage = self.msg_llr[:, :, stage_ind, cw_ind]
        llr_sorted = np.take_along_axis(llr_stage, pointer, axis=1)

        llr_sorted = np.clip(llr_sorted, -self._llr_max, self._llr_max)
        penalty = np.log(1 + np.exp(-llr_sorted))
        self.msg_pm += np.sum(penalty, axis=-1)

    def _update_rep_code_np(self, cw_ind):
        """Update rep. code (i.e., only rightmost bit is non-frozen)
        sub-code at positions ``cw_ind`` in Numpy.

        The first ``list_size`` decoders (in pointer order) take the all-zero
        codeword, the remaining ``list_size`` decoders the all-one codeword.
        Path metrics and ``msg_uhat`` are updated accordingly before the list
        is sorted and duplicated.

        See Eq. (31) in [Hashemi_SSCL]_.
        """
        stage_ind = int(np.log2(len(cw_ind)))
        num_paths = self._list_size
        bs = self._dec_pointer.shape[0]

        # gather LLRs of the sub-code in the order given by the pointers
        pointer = np.expand_dims(self._dec_pointer, axis=-1)
        llr = np.take_along_axis(self.msg_llr[:, :, stage_ind, cw_ind],
                                 pointer,
                                 axis=1)

        # upper branch has negative llr values (bit is 1)
        llr[:, num_paths:, :] = - llr[:, num_paths:, :]
        llr = np.clip(llr, -self._llr_max, self._llr_max)
        self.msg_pm += np.sum(np.log(1 + np.exp(-llr)), axis=-1)

        # upper branch decides for the all-one codeword
        batch_idx = np.arange(bs)[:, np.newaxis, np.newaxis]
        upper_dec = self._dec_pointer[:, num_paths:, np.newaxis]
        self.msg_uhat[batch_idx, upper_dec, stage_ind, cw_ind] = 1

        # branch last bit at pos cw_ind[-1]; the pm was already updated above
        self._update_single_bit_np([cw_ind[-1]])
        self._sort_decoders_np()
        self._duplicate_paths_np()

    def _update_single_bit_np(self, ind_u):
        """Update single bit at position ``ind_u`` of all decoders in Numpy.

        For a non-frozen position, the bit of the last ``list_size`` decoders
        (in pointer order) is set to 1 in stage 0 of ``self.msg_uhat``.
        Frozen positions are left untouched.
        """
        if self._frozen_ind[ind_u]==0: # position is non-frozen
            upper_dec = self._dec_pointer[:, self._list_size:]
            # indexing with ``ind_u`` yields a copy; modify and write back
            bit_col = self.msg_uhat[:, :, 0, ind_u]
            np.put_along_axis(bit_col,
                              np.expand_dims(upper_dec, axis=-1),
                              1.,
                              axis=1)
            self.msg_uhat[:, :, 0, ind_u] = bit_col


    def _update_pm_np(self, ind_u):
        """Update path metric of all decoders at bit position ``ind_u`` in
        Numpy.

        Adds ``log(1+exp(-(1-2*u_hat)*llr))`` to ``self.msg_pm`` (in-place),
        where ``u_hat`` and ``llr`` are taken from stage 0.

        We apply Eq. (10) from [Stimming_LLR]_.
        """
        # msg_pm is stored in sorted order; resolve the pointers to access
        # the corresponding decoder states
        pointer = np.expand_dims(self._dec_pointer, axis=-1)

        bit = np.take_along_axis(self.msg_uhat[:, :, 0, ind_u],
                                 pointer,
                                 axis=1)
        bit = np.squeeze(bit, axis=-1)

        llr = np.take_along_axis(self.msg_llr[:, :, 0, ind_u],
                                 pointer,
                                 axis=1)
        llr = np.squeeze(llr, axis=-1)
        llr = np.clip(llr, -self._llr_max, self._llr_max)

        signed_llr = (1-2*bit) * llr
        self.msg_pm += np.log(1 + np.exp(-signed_llr))

    def _sort_decoders_np(self):
        """Sort decoders by ascending path metric.

        Only ``self.msg_pm`` and the pointers in ``self._dec_pointer`` are
        reordered; the decoder states themselves are not moved.
        """
        order = np.argsort(self.msg_pm, axis=-1)

        def _reorder(values):
            # apply the same permutation per batch sample
            return np.take_along_axis(values, order, axis=1)

        self.msg_pm = _reorder(self.msg_pm)
        self._dec_pointer = _reorder(self._dec_pointer)

    def _cn_op_np(self, x, y):
        """Check node update (boxplus) for LLRs in Numpy.

        Both inputs are clipped to ``[-llr_max, llr_max]`` and combined
        element-wise as ``log(1+e^(x+y)) - log(e^x+e^y)``.

        See [Stimming_LLR]_ and [Hashemi_SSCL]_ for detailed equations.
        """
        x_clip = np.clip(x, -self._llr_max, self._llr_max)
        y_clip = np.clip(y, -self._llr_max, self._llr_max)

        # log-domain difference instead of a division (numerical stability)
        log_num = np.log(1 + np.exp(x_clip + y_clip))
        log_den = np.log(np.exp(x_clip) + np.exp(y_clip))

        return log_num - log_den

    def _vn_op_np(self, x, y, u_hat):
        """Variable node update for LLRs in Numpy.

        Returns ``(1 - 2*u_hat) * x + y`` element-wise, i.e., the sign of
        ``x`` is flipped wherever ``u_hat`` equals 1.
        """
        # maps bit values {0, 1} to signs {+1, -1}
        sign = 1-2*u_hat
        return sign * x + y

    def _duplicate_paths_np(self):
        """Copy the first ``list_size`` paths into the remaining
        ``list_size`` paths in Numpy.

        Source and target decoders are given by the first and second half of
        ``self._dec_pointer``, respectively.
        """
        num_paths = self._list_size
        src_dec = self._dec_pointer[:, :num_paths]
        dst_dec = self._dec_pointer[:, num_paths:]

        # decoder states are addressed via pointers (per batch sample)
        for i, (src, dst) in enumerate(zip(src_dec, dst_dec)):
            self.msg_uhat[i, dst] = self.msg_uhat[i, src]
            self.msg_llr[i, dst] = self.msg_llr[i, src]

        # pm must be sorted directly (not accessed via pointer)
        self.msg_pm[:, num_paths:] = self.msg_pm[:, :num_paths]

    def _polar_decode_scl_np(self, cw_ind):
        """Recursive decoding function in Numpy.

        The sub-code at positions ``cw_ind`` is decoded by splitting it into a
        `left` and a `right` half until a leaf node (single position) is
        reached; terminology follows [Hashemi_SSCL]_ and [Stimming_LLR]_.
        All decoder states are updated in-place.

        If ``use_fast_scl`` is active, rate-0 and repetition sub-codes are
        handled directly instead of being unrolled (tree pruning as proposed
        in [Hashemi_SSCL]_).
        """
        n = len(cw_ind)
        stage_ind = int(np.log2(n))

        # ----- leaf node: take decision for a single position -----
        if n <= 1:
            self._update_single_bit_np(cw_ind)
            self._update_pm_np(cw_ind)

            # information positions branch the list
            if self._frozen_ind[cw_ind]==0:
                self._sort_decoders_np()
                # duplicate the best list_size decoders
                self._duplicate_paths_np()
            return

        # ----- tree pruning for special sub-codes -----
        if self._use_fast_scl:
            frozen = self._frozen_ind[cw_ind]

            # rate-0: all positions are frozen
            if np.sum(frozen)==n:
                self._update_rate0_code_np(cw_ind)
                return

            # repetition code: only the last position is non-frozen
            if frozen[-1]==0 and np.sum(frozen[:-1])==n-1:
                self._update_rep_code_np(cw_ind)
                return

        # ----- inner node: split into left and right half -----
        half = int(n/2)
        cw_ind_left = cw_ind[0:half]
        cw_ind_right = cw_ind[half:]

        # left branch: check-node update, then decode left sub-graph
        llr_left = self.msg_llr[:, :, stage_ind, cw_ind_left]
        llr_right = self.msg_llr[:, :, stage_ind, cw_ind_right]
        self.msg_llr[:, :, stage_ind-1, cw_ind_left] = self._cn_op_np(
            llr_left, llr_right)
        self._polar_decode_scl_np(cw_ind_left)

        # right branch: variable-node update, then decode right sub-graph
        # (all inputs are read again as the left branch modifies the decoder
        # states in-place)
        u_hat_left_up = self.msg_uhat[:, :, stage_ind-1, cw_ind_left]
        llr_left = self.msg_llr[:, :, stage_ind, cw_ind_left]
        llr_right = self.msg_llr[:, :, stage_ind, cw_ind_right]
        self.msg_llr[:, :, stage_ind-1, cw_ind_right] = self._vn_op_np(
            llr_left, llr_right, u_hat_left_up)
        self._polar_decode_scl_np(cw_ind_right)

        # combine u_hat of both halves as [left XOR right, right]
        u_hat_left_up = self.msg_uhat[:, :, stage_ind-1, cw_ind_left]
        u_hat_right_up = self.msg_uhat[:, :, stage_ind-1, cw_ind_right]
        u_hat_xor = (u_hat_left_up != u_hat_right_up) + 0

        # provide u_hat for next higher stage
        self.msg_uhat[:, :, stage_ind, cw_ind] = np.concatenate(
            [u_hat_xor, u_hat_right_up], axis=-1)
        return

    def _decode_np_batch(self, llr_ch):
        """Decode batch of ``llr_ch`` with Numpy decoder.

        Allocates the decoder states as attributes of the layer, runs the
        recursive decoder and returns ``(msg_uhat, msg_pm)`` with the list
        sorted by ascending path metric.
        """
        bs = llr_ch.shape[0]
        num_paths = self._list_size
        num_dec = 2*num_paths

        # states of all 2*list_size decoders for stages 0,...,n_stages
        state_shape = [bs, num_dec, self._n_stages+1, self._n]
        self.msg_uhat = np.zeros(state_shape)
        self.msg_llr = np.zeros(state_shape)

        # in each half of the list only the first decoder starts without
        # penalty; the remaining L-1 decoders start with a high path metric
        self.msg_pm = np.zeros([bs, num_dec])
        self.msg_pm[:, 1:num_paths] = self._llr_max
        self.msg_pm[:, num_paths+1:] = self._llr_max

        # use pointers to avoid in-memory sorting
        identity = np.expand_dims(np.arange(num_dec), axis=0)
        self._dec_pointer = np.tile(identity, [bs, 1])

        # last stage holds llr_ch (broadcast via list dimension)
        self.msg_llr[:, :, self._n_stages, :] = np.expand_dims(llr_ch, axis=1)

        # run recursive decoder and sort the final list
        self._polar_decode_scl_np(self._cw_ind)
        self._sort_decoders_np()

        # remove pointers, i.e., bring msg_uhat into sorted order
        for ind, pointer in enumerate(self._dec_pointer):
            self.msg_uhat[ind] = self.msg_uhat[ind, pointer]

        return self.msg_uhat, self.msg_pm

    def _decode_np_hybrid(self, llr_ch, u_hat_sc, crc_valid):
        """Hybrid SCL decoding stage that decodes iff CRC from previous SC
        decoding attempt failed.

        This option avoids the usage of the high-complexity SCL decoder in cases
        where SC would be sufficient. For further details we refer to
        [Cammerer_Hybrid_SCL]_ (we use SC instead of the proposed BP stage).

        Remark: This decoder does not exactly implement SCL as the CRC
        can be false positive after the SC stage. However, in these cases
        SCL+CRC may also yield the wrong results.

        Remark 2: Due to the excessive control flow (if/else) and the
        varying batch-sizes, this function is only available as Numpy
        decoder (i.e., runs on the CPU).

        Args:
            llr_ch: Array of shape `[bs, n]` with the channel LLRs.

            u_hat_sc: Array of shape `[bs, k]` with the information bit
                estimates of the SC decoder.

            crc_valid: Boolean array of shape `[bs, 1]`; `True` if the CRC of
                the SC estimate holds.

        Returns:
            Tuple ``(msg_uhat, msg_pm)`` of shapes
            `[bs, 2*list_size, 1, n]` and `[bs, 2*list_size]`. For samples
            with valid CRC only the first list entry is used; it holds the SC
            estimate with path metric 0 while all other entries carry a high
            path metric.
        """

        bs = llr_ch.shape[0]
        crc_valid = np.squeeze(crc_valid, axis=-1)
        # index of codewords that need SCL decoding
        ind_invalid = np.arange(bs)[np.invert(crc_valid)]

        # init results with SC estimate in the first list entry; all other
        # entries are penalized such that they are never selected
        msg_uhat = np.zeros([bs, 2*self._list_size, 1, self._n])
        msg_pm = np.ones([bs, 2*self._list_size]) * self._llr_max * self.k
        msg_pm[:, 0] = 0

        # copy SC data
        msg_uhat[:, 0, 0, self._info_pos] = u_hat_sc

        # Skip the SCL stage if all CRCs hold; otherwise the whole decoding
        # tree would be processed for an empty batch.
        if ind_invalid.size > 0:
            # SCL decoding only for the bs_hyb samples with failed CRC
            llr_ch_hyb = np.take(llr_ch, ind_invalid, axis=0)
            msg_uhat_hyb, msg_pm_hyb = self._decode_np_batch(llr_ch_hyb)

            # overwrite SC results by SCL results (only stage 0 is required)
            msg_uhat[ind_invalid, :, 0:1, :] = msg_uhat_hyb[:, :, 0:1, :]
            msg_pm[ind_invalid, :] = msg_pm_hyb

        return msg_uhat, msg_pm

    #########################
    # Keras layer functions
    #########################

    def build(self, input_shape):
        """Verify that ``input_shape`` is compatible with the decoder.

        Raises an ``AssertionError`` if the last dimension does not equal
        ``n`` or if the input has less than 2 dimensions.
        """
        last_dim = input_shape[-1]
        assert last_dim==self._n, "Invalid input shape."

        rank = len(input_shape)
        assert rank>=2, 'Inputs must have at least 2 dimensions.'

    def call(self, inputs):
        """Successive cancellation list (SCL) decoding function.

        This function performs successive cancellation list decoding
        and returns the estimated information bits.

        An outer CRC can be applied optionally by setting ``crc_degree``.

        Depending on the configuration, decoding is done by one of three
        backends: the hybrid SC/SCL Numpy decoder, the Numpy decoder or the
        TF decoder. Afterwards, the candidate with the smallest (CRC
        penalized) path metric is selected from the list.

        Args:
            inputs (tf.float32): Tensor of shape `[...,n]` containing the
                channel LLR values (as logits).

        Returns:
            `tf.float32`: Tensor of shape `[...,k]` containing
            hard-decided estimations of all ``k`` information bits.
            If ``return_crc_status`` is active, a tuple is returned that
            additionally contains the CRC status of the selected candidate
            (shape `[...]`).

        Raises:
            ValueError: If ``inputs`` is not of shape `[..., n]`
                or `dtype` is not `tf.float32`.

            InvalidArgumentError: When rank(``inputs``)<2.

        Note:
            This function recursively unrolls the SCL decoding tree, thus,
            for larger values of ``n`` building the decoding graph can become
            time consuming. Please consider the ``cpu_only`` option instead.
        """

        # inputs must have the dtype the layer was configured with
        tf.debugging.assert_type(inputs, self._output_dtype,
                                 "Invalid input dtype.")
        # internal calculations still in tf.float32
        inputs = tf.cast(inputs, tf.float32)

        # last dim must be of length n
        tf.debugging.assert_equal(tf.shape(inputs)[-1],
                                  self._n,
                                  "Last input dimension must be of length n.")

        # Reshape inputs to [-1, n]
        tf.debugging.assert_greater(tf.rank(inputs), 1)
        # static shape; used below to restore the leading dimensions
        input_shape = inputs.shape
        new_shape = [-1, self._n]
        llr_ch = tf.reshape(inputs, new_shape)

        llr_ch = -1. * llr_ch # logits are converted into "true" llrs

        # hybrid mode: SC decoding first, SCL (Numpy) only if the CRC fails
        if self._use_hybrid_sc:
            # use SC decoder to decode first
            # (sign flip from above is reverted for the SC decoder input)
            u_hat = self._decoder_sc(-llr_ch)
            _, crc_valid = self._crc_decoder(u_hat)
            msg_uhat, msg_pm = tf.py_function(func=self._decode_np_hybrid,
                                              inp=[llr_ch, u_hat, crc_valid],
                                              Tout=[tf.float32, tf.float32])
            # note: return shape is only 1 in 3. dim (to avoid copy overhead)
            # (reshape restores the shape information after py_function)
            msg_uhat = tf.reshape(msg_uhat, [-1, 2*self._list_size, 1, self._n])
            msg_pm = tf.reshape(msg_pm, [-1, 2*self._list_size])
        else:
            if self._cpu_only:
                # Numpy decoder wrapped as py_function
                msg_uhat, msg_pm = tf.py_function(func=self._decode_np_batch,
                                                  inp=[llr_ch],
                                                  Tout=[tf.float32, tf.float32])
                # restore shape information
                msg_uhat = tf.reshape(msg_uhat,
                            [-1, 2*self._list_size, self._n_stages+1, self._n])
                msg_pm = tf.reshape(msg_pm, [-1, 2*self._list_size])
            else:
                # TF decoder (unrolls the decoding graph)
                msg_uhat, msg_pm = self._decode_tf(llr_ch)

        # check CRC (and remove CRC parity bits)
        if self._use_crc:
            # information bits (stage 0) of all candidates in the list
            u_hat_list = tf.gather(msg_uhat[:, :, 0, :],
                                   self._info_pos,
                                   axis=-1)
            # undo input bit interleaving
            # remark: the output is not interleaved for compatibility with SC
            if self._iil:
                u_hat_list_crc = tf.gather(u_hat_list,
                                           self._ind_iil_inv,
                                           axis=-1)
            else: # no interleaving applied
                u_hat_list_crc = u_hat_list

            # CRC status per candidate (replaces the status of the SC stage
            # in hybrid mode)
            _, crc_valid = self._crc_decoder(u_hat_list_crc)
            # add penalty to pm if CRC fails
            pm_penalty = ((1. - tf.cast(crc_valid, tf.float32))
                       * self._llr_max * self.k)
            msg_pm += tf.squeeze(pm_penalty, axis=2)

        # select most likely candidate
        cand_ind = tf.argmin(msg_pm, axis=-1)
        c_hat = tf.gather(msg_uhat[:, :, 0, :], cand_ind, axis=1, batch_dims=1)
        u_hat = tf.gather(c_hat, self._info_pos, axis=-1)

        # and reconstruct input shape
        output_shape = input_shape.as_list()
        output_shape[-1] = self.k
        output_shape[0] = -1 # first dim can be dynamic (None)
        u_hat_reshape = tf.reshape(u_hat, output_shape)

        if self._return_crc_status:
            # reconstruct CRC status
            # NOTE(review): ``crc_valid`` is only assigned in the hybrid and
            # CRC branches above; presumably the constructor ensures that a
            # CRC is used whenever ``return_crc_status`` is set - confirm.
            crc_status = tf.gather(crc_valid, cand_ind, axis=1, batch_dims=1)
            # reconstruct shape
            output_shape.pop() # remove last dimension
            crc_status = tf.reshape(crc_status, output_shape)

            crc_status = tf.cast(crc_status, self._output_dtype)
            # return info bits and CRC status
            return tf.cast(u_hat_reshape, self._output_dtype), crc_status
        else: # return only info bits
            return tf.cast(u_hat_reshape, self._output_dtype)


class PolarBPDecoder(Layer):
    # pylint: disable=line-too-long
    """PolarBPDecoder(frozen_pos, n, num_iter=20, hard_out=True, output_dtype=tf.float32, **kwargs)

    Belief propagation (BP) decoder for Polar codes [Arikan_Polar]_ and
    Polar-like codes based on [Arikan_BP]_ and [Forney_Graphs]_.

    The class inherits from the Keras layer class and can be used as layer in a
    Keras model.

    Remark: The PolarBPDecoder does currently not support XLA.

    Parameters
    ----------
        frozen_pos: ndarray
            Array of `int` defining the ``n-k`` indices of the frozen positions.
            Any integer dtype is accepted.

        n: int
            Defining the codeword length. Other numeric types are converted
            to `int`.

        num_iter: int
            Defining the number of decoder iterations (no early stopping used
            at the moment).

        hard_out: bool
            Defaults to True. If True, the decoder provides hard-decided
            information bits instead of soft-values.

        output_dtype: tf.DType
            Defaults to tf.float32. Defines the output datatype of the layer
            (internal precision remains tf.float32).

    Input
    -----
        inputs: [...,n], tf.float32
            2+D tensor containing the channel logits/llr values. The dtype
            must equal ``output_dtype``.

    Output
    ------
        : [...,k], tf.float32
            2+D tensor containing bit-wise soft-estimates
            (or hard-decided bit-values) of all ``k`` information bits, cast
            to ``output_dtype``.

    Raises
    ------
        AssertionError
            If ``n`` is not a number.

        AssertionError
            If ``n`` is not a power of 2.

        AssertionError
            If the number of elements in ``frozen_pos`` is greater than ``n``.

        AssertionError
            If ``frozen_pos`` does not consist of integers.

        AssertionError
            If ``hard_out`` is not `bool`.

        ValueError
            If ``output_dtype`` is not {tf.float16, tf.float32, tf.float64}.

        AssertionError
            If ``num_iter`` is not `int`.

        AssertionError
            If ``num_iter`` is not a positive value.

    Note
    ----
        This decoder is fully differentiable and, thus, well-suited for
        gradient descent-based learning tasks such as `learned code design`
        [Ebada_Design]_.

        As commonly done, we assume frozen bits are set to `0`. Please note
        that - although its practical relevance is only little - setting frozen
        bits to `1` may result in `affine` codes instead of linear code as the
        `all-zero` codeword is not necessarily part of the code any more.

    """

    def __init__(self,
                 frozen_pos,
                 n,
                 num_iter=20,
                 hard_out=True,
                 output_dtype=tf.float32,
                 **kwargs):

        if output_dtype not in (tf.float16, tf.float32, tf.float64):
            raise ValueError(
                'output_dtype must be {tf.float16, tf.float32, tf.float64}.')

        if output_dtype is not tf.float32:
            print('Note: decoder uses tf.float32 for internal calculations.')

        super().__init__(dtype=output_dtype, **kwargs)
        self._output_dtype = output_dtype

        # validate the code parameters
        assert isinstance(n, numbers.Number), "n must be a number."
        n = int(n) # n can be float (e.g. as result of n=k*r)
        # np.integer matches every integer width; testing against the Python
        # `int` type would only match the platform default integer dtype and
        # wrongly reject e.g. int32 arrays on 64-bit systems
        assert np.issubdtype(frozen_pos.dtype, np.integer), \
            "frozen_pos contains non int."
        assert len(frozen_pos)<=n, "Num. of elements in frozen_pos cannot " \
            "be greater than n."
        # exact integer test; also rejects n<=0 with an AssertionError instead
        # of failing inside log2
        assert n>0 and (n & (n-1))==0, "n must be a power of 2."

        assert isinstance(hard_out, bool), "hard_out must be boolean."

        # store internal attributes
        self._n = n
        self._frozen_pos = frozen_pos
        self._k = self._n - len(self._frozen_pos)
        self._info_pos = np.setdiff1d(np.arange(self._n), self._frozen_pos)
        # fails e.g. if frozen_pos contains duplicates or out-of-range indices
        assert self._k==len(self._info_pos), "Internal error: invalid " \
                                             "info_pos generated."

        assert isinstance(num_iter, int), "num_iter must be integer."
        assert num_iter>0, "num_iter must be a positive value."
        self._num_iter = tf.constant(num_iter, dtype=tf.int32)

        self._llr_max = 19.3 # internal max LLR value
        self._hard_out = hard_out

        # depth of decoding graph
        self._n_stages = int(np.log2(self._n))

    #########################################
    # Public methods and properties
    #########################################

    @property
    def n(self):
        """Codeword length."""
        return self._n

    @property
    def k(self):
        """Number of information bits."""
        return self._k

    @property
    def frozen_pos(self):
        """Frozen positions for Polar decoding."""
        return self._frozen_pos

    @property
    def info_pos(self):
        """Information bit positions for Polar encoding."""
        return self._info_pos

    @property
    def llr_max(self):
        """Maximum LLR value for internal calculations."""
        return self._llr_max

    @property
    def num_iter(self):
        """Number of decoding iterations."""
        return self._num_iter

    @property
    def hard_out(self):
        """Indicates if decoder hard-decides outputs."""
        return self._hard_out

    @property
    def output_dtype(self):
        """Output dtype of decoder."""
        return self._output_dtype

    @num_iter.setter
    def num_iter(self, num_iter):
        "Number of decoding iterations."
        assert isinstance(num_iter, int), 'num_iter must be int.'
        # same constraint as in the constructor: with zero iterations no
        # message is ever written and the final read in `_decode_bp` would
        # access a negative index
        assert num_iter>0, 'num_iter must be a positive value.'
        self._num_iter = tf.constant(num_iter, dtype=tf.int32)

    #########################
    # Utility methods
    #########################

    def _boxplus_tf(self, x, y):
        """Check-node update (boxplus) for LLR inputs.

        Both inputs are clipped to ``[-llr_max, llr_max]`` before the update.
        Operations are performed element-wise.
        """
        x_in = tf.clip_by_value(x,
                                clip_value_min=-self._llr_max,
                                clip_value_max=self._llr_max)
        y_in = tf.clip_by_value(y,
                                clip_value_min=-self._llr_max,
                                clip_value_max=self._llr_max)

        # avoid division for numerical stability
        llr_out = tf.math.log(1 + tf.math.exp(x_in + y_in))
        llr_out -= tf.math.log(tf.math.exp(x_in) + tf.math.exp(y_in))

        return llr_out

    def _stage_indices(self):
        """Compute the node index pattern of every stage of the decoding graph.

        Returns:
            `list` with one tuple ``(ind_1, ind_2, ind_inv)`` per stage.
            ``ind_1`` and ``ind_2`` are the ``n/2`` positions of the two
            inputs of each processing node (``ind_2`` is offset by
            ``2**stage``); ``ind_inv`` is the permutation that restores the
            natural order after concatenating the outputs gathered via
            ``ind_1`` and ``ind_2``.
        """
        ind_range = np.arange(self._n // 2)
        stage_inds = []
        for ind_s in range(self._n_stages):
            ind_1 = ind_range * 2 - np.mod(ind_range, 2**ind_s)
            ind_2 = ind_1 + 2**ind_s
            # simplify gather with concatenated outputs
            ind_inv = np.argsort(np.concatenate([ind_1, ind_2], axis=0))
            stage_inds.append((ind_1, ind_2, ind_inv))
        return stage_inds

    def _decode_bp(self, llr_ch, num_iter):
        """Iterative BP decoding function with LLR-values.

        Messages are stored in two TensorArrays (left- and right-propagating
        messages) which are indexed by ``stage + iteration * (n_stages+1)``.

        Args:
            llr_ch (tf.float32): Tensor of shape `[batch_size, n]` containing
                the channel logits/llr values where `batch_size` denotes the
                batch-size.

            num_iter (int): Defining the number of decoder iteration
                (no early stopping used at the moment). Must be positive.
        Returns:
            `tf.float32`: Tensor of shape `[batch_size, k]` containing
            bit-wise soft-estimates (or hard-decided bit-values) of all
            information bits.
        """

        bs = tf.shape(llr_ch)[0]
        n_half = self._n // 2
        # offset between the messages of two consecutive iterations
        it_stride = self._n_stages + 1

        # store intermediate Tensors in TensorArray
        msg_l = tf.TensorArray(tf.float32,
                               size=num_iter*it_stride,
                               dynamic_size=False,
                               clear_after_read=False)

        msg_r = tf.TensorArray(tf.float32,
                               size=num_iter*it_stride,
                               dynamic_size=False,
                               clear_after_read=False)

        # init frozen positions with the max. LLR (i.e., known to be zero)
        msg_r_in = np.zeros([1, self._n])
        msg_r_in[:, self._frozen_pos] = self._llr_max
        # copy for all batch-samples
        msg_r_in = tf.tile(tf.constant(msg_r_in, tf.float32), [bs, 1])

        # index patterns only depend on the stage; compute them once
        stage_inds = self._stage_indices()

        # perform decoding iterations
        # iterate over the same `num_iter` that sizes the arrays above and
        # selects the final read position below
        for ind_it in tf.range(num_iter):
            # update left-to-right messages
            for ind_s in range(self._n_stages):
                ind_1, ind_2, ind_inv = stage_inds[ind_s]

                # load incoming l messages
                if ind_s==self._n_stages-1:
                    # last stage is connected to the channel
                    l1_in = tf.gather(llr_ch, ind_1, axis=1)
                    l2_in = tf.gather(llr_ch, ind_2, axis=1)
                elif ind_it==0:
                    # no left messages available in the first iteration
                    l1_in = tf.zeros([bs, n_half])
                    l2_in = tf.zeros([bs, n_half])
                else:
                    # left messages of the previous iteration
                    l_in = msg_l.read((ind_s+1) + (ind_it-1)*it_stride)
                    l1_in = tf.gather(l_in, ind_1, axis=1)
                    l2_in = tf.gather(l_in, ind_2, axis=1)

                # load incoming r messages
                if ind_s==0:
                    # first stage is fed by the frozen bit knowledge
                    r1_in = tf.gather(msg_r_in, ind_1, axis=1)
                    r2_in = tf.gather(msg_r_in, ind_2, axis=1)
                else:
                    r_in = msg_r.read(ind_s + ind_it*it_stride)
                    r1_in = tf.gather(r_in, ind_1, axis=1)
                    r2_in = tf.gather(r_in, ind_2, axis=1)

                # node update functions
                r1_out = self._boxplus_tf(r1_in, l2_in + r2_in)
                r2_out = self._boxplus_tf(r1_in, l1_in) + r2_in

                # and re-concatenate output
                r_out = tf.concat([r1_out, r2_out], 1)
                r_out = tf.gather(r_out, ind_inv, axis=1)
                msg_r = msg_r.write((ind_s+1) + ind_it*it_stride, r_out)

            # update right-to-left messages
            for ind_s in range(self._n_stages-1, -1, -1):
                ind_1, ind_2, ind_inv = stage_inds[ind_s]

                # load messages
                if ind_s==self._n_stages-1:
                    l1_in = tf.gather(llr_ch, ind_1, axis=1)
                    l2_in = tf.gather(llr_ch, ind_2, axis=1)
                else:
                    # written earlier in this right-to-left sweep
                    l_in = msg_l.read((ind_s+1) + ind_it*it_stride)
                    l1_in = tf.gather(l_in, ind_1, axis=1)
                    l2_in = tf.gather(l_in, ind_2, axis=1)

                if ind_s==0:
                    r1_in = tf.gather(msg_r_in, ind_1, axis=1)
                    r2_in = tf.gather(msg_r_in, ind_2, axis=1)
                else:
                    r_in = msg_r.read(ind_s + ind_it*it_stride)
                    r1_in = tf.gather(r_in, ind_1, axis=1)
                    r2_in = tf.gather(r_in, ind_2, axis=1)

                # node update functions
                l1_out = self._boxplus_tf(l1_in, l2_in + r2_in)
                l2_out = self._boxplus_tf(r1_in, l1_in) + l2_in

                l_out = tf.concat([l1_out, l2_out], 1)
                l_out = tf.gather(l_out, ind_inv, axis=1)
                msg_l = msg_l.write(ind_s + ind_it*it_stride, l_out)

        # recover u_hat from the stage-0 left messages of the last iteration
        u_hat = tf.gather(msg_l.read((num_iter-1)*it_stride),
                          self._info_pos,
                          axis=1)
        # if active, hard-decide output bits
        if self._hard_out:
            u_hat = tf.where(u_hat>0, 0., 1.)
        else: # re-transform soft output to logits (instead of llrs)
            u_hat = -1. * u_hat
        return u_hat

    #########################
    # Keras layer functions
    #########################

    def build(self, input_shape):
        """Build and check if shape of input is invalid."""
        assert (input_shape[-1]==self._n), "Invalid input shape"
        assert (len(input_shape)>=2), 'Inputs must have at least 2 dimensions.'

    def call(self, inputs):
        """Iterative BP decoding function.

        This function performs `num_iter` belief propagation decoding iterations
        and returns the estimated information bits.

        Args:
            inputs (tf.float32): Tensor of shape `[...,n]` containing the
                channel logits/llr values. The dtype must equal
                ``output_dtype``.

        Returns:
            `tf.float32`: Tensor of shape `[...,k]` containing
                bit-wise soft-estimates (or hard-decided bit-values) of all
                ``k`` information bits, cast to ``output_dtype``.

        Raises:
            An error is raised by ``tf.debugging.assert_type`` if the dtype
            of ``inputs`` is not ``output_dtype``.

        Note:
            This function recursively unrolls the BP decoding graph, thus,
            for larger values of ``n`` or more iterations, building the
            decoding graph can become time and memory consuming.
        """

        tf.debugging.assert_type(inputs, self._output_dtype,
                                 "Invalid input dtype.")
        # internal calculations still in tf.float32
        inputs = tf.cast(inputs, tf.float32)

        # Reshape inputs to [-1, n]
        input_shape = inputs.shape
        new_shape = [-1, self._n]
        llr_ch = tf.reshape(inputs, new_shape)

        llr_ch = -1. * llr_ch # logits are converted into "true" llrs

        # and decode
        u_hat = self._decode_bp(llr_ch, self._num_iter)

        # and reconstruct input shape
        output_shape = input_shape.as_list()
        output_shape[-1] = self.k
        output_shape[0] = -1 # first dim can be dynamic (None)
        u_hat_reshape = tf.reshape(u_hat, output_shape)
        return tf.cast(u_hat_reshape, self._output_dtype)


class Polar5GDecoder(Layer):
    # pylint: disable=line-too-long
    """Polar5GDecoder(enc_polar, dec_type="SC", list_size=8, num_iter=20,return_crc_status=False, output_dtype=tf.float32, **kwargs)

    Wrapper for 5G compliant decoding including rate-recovery and CRC removal.

    The class inherits from the Keras layer class and can be used as layer in a
    Keras model.

    Parameters
    ----------
        enc_polar: Polar5GEncoder
            Instance of the :class:`~sionna.fec.polar.encoding.Polar5GEncoder`
            used for encoding including rate-matching.

        dec_type: str
            Defaults to `"SC"`. Defining the decoder to be used.
            Must be one of the following `{"SC", "SCL", "hybSCL", "BP"}`.

        list_size: int
            Defaults to 8. Defining the list size `iff` list-decoding is used.
            Only required for ``dec_types`` `{"SCL", "hybSCL"}`.

        num_iter: int
            Defaults to 20. Defining the number of BP iterations. Only required
            for ``dec_type`` `"BP"`.

        return_crc_status: bool
            Defaults to False. If True, the decoder additionally returns the
            CRC status indicating if a codeword was (most likely) correctly
            recovered.

        output_dtype: tf.DType
            Defaults to tf.float32. Defines the output datatype of the layer
            (internal precision remains tf.float32).

    Input
    -----
        inputs: [...,n], tf.float32
            2+D tensor containing the channel logits/llr values. The dtype
            must equal ``output_dtype``.

    Output
    ------

        b_hat : [...,k], tf.float32
            2+D tensor containing hard-decided estimations of all `k`
            information bits, cast to ``output_dtype``.

        crc_status : [...], tf.float32
            CRC status indicating if a codeword was (most likely) correctly
            recovered, cast to ``output_dtype``. This is only returned if
            ``return_crc_status`` is True.
            Note that false positives are possible.

    Raises
    ------
        AssertionError
            If ``enc_polar`` is not `Polar5GEncoder`.

        ValueError
            If ``dec_type`` is not `{"SC", "SCL", "hybSCL", "BP"}`.

        AssertionError
            If ``dec_type`` is not `str`.

        AssertionError
            If ``return_crc_status`` is not `bool`.

        AssertionError
            If ``dec_type`` is `"BP"` and ``num_iter`` is not a positive `int`.

        ValueError
            If ``output_dtype`` is not {tf.float16, tf.float32, tf.float64}.

        InvalidArgumentError
            When rank(``inputs``)<2.

    Note
    ----
        This layer supports the uplink and downlink Polar rate-matching scheme
        without `codeword segmentation`.

        Although the decoding `list size` is not provided by 3GPP
        [3GPPTS38212]_, the consortium has agreed on a `list size` of 8 for the
        5G decoding reference curves [Bioglio_Design]_.

        All list-decoders apply `CRC-aided` decoding, however, the non-list
        decoders (`"SC"` and `"BP"`) cannot materialize the CRC leading to an
        effective rate-loss.

    """

    def __init__(self,
                 enc_polar,
                 dec_type="SC",
                 list_size=8,
                 num_iter=20,
                 return_crc_status=False,
                 output_dtype=tf.float32,
                 **kwargs):

        if output_dtype not in (tf.float16, tf.float32, tf.float64):
            raise ValueError(
                'output_dtype must be {tf.float16, tf.float32, tf.float64}.')

        if output_dtype is not tf.float32:
            print('Note: decoder uses tf.float32 for internal calculations.')
        self._output_dtype = output_dtype

        super().__init__(dtype=output_dtype, **kwargs)

        assert isinstance(enc_polar, Polar5GEncoder), \
                                    "enc_polar must be Polar5GEncoder."
        assert isinstance(dec_type, str), "dec_type must be str."
        # list_size is not checked here (done during decoder init)

        # Store internal attributes
        self._n_target = enc_polar.n_target
        self._k_target = enc_polar.k_target
        self._n_polar = enc_polar.n_polar
        self._k_polar = enc_polar.k_polar
        self._k_crc = enc_polar.enc_crc.crc_length
        # channel (bit) interleaver is only undone for uplink and the input
        # bit interleaver only for downlink
        self._bil = enc_polar._channel_type == "uplink"
        self._iil = enc_polar._channel_type == "downlink"
        self._llr_max = 100 # Internal max LLR value (for punctured positions)
        self._enc_polar = enc_polar
        self._dec_type = dec_type

        # Frozen / information positions of the mother Polar code; these back
        # the `frozen_pos` and `info_pos` properties
        self._frozen_pos = self._enc_polar.frozen_pos
        self._info_pos = np.setdiff1d(np.arange(self._n_polar),
                                      self._frozen_pos)

        # Initialize the de-interleaver patterns
        self._init_interleavers()

        # Initialize decoder
        if dec_type=="SC":
            print("Warning: 5G Polar codes use an integrated CRC that " \
                  "cannot be materialized with SC decoding and, thus, " \
                  "causes a degraded performance. Please consider SCL " \
                  "decoding instead.")
            self._polar_dec = PolarSCDecoder(self._enc_polar.frozen_pos,
                                             self._n_polar)
        elif dec_type=="SCL":
            self._polar_dec = PolarSCLDecoder(self._enc_polar.frozen_pos,
                                self._n_polar,
                                crc_degree=self._enc_polar.enc_crc.crc_degree,
                                list_size=list_size,
                                ind_iil_inv = self.ind_iil_inv)
        elif dec_type=="hybSCL":
            self._polar_dec = PolarSCLDecoder(self._enc_polar.frozen_pos,
                                self._n_polar,
                                crc_degree=self._enc_polar.enc_crc.crc_degree,
                                list_size=list_size,
                                use_hybrid_sc=True,
                                ind_iil_inv = self.ind_iil_inv)
        elif dec_type=="BP":
            print("Warning: 5G Polar codes use an integrated CRC that " \
                  "cannot be materialized with BP decoding and, thus, " \
                  "causes a degraded performance. Please consider SCL " \
                  " decoding instead.")
            assert isinstance(num_iter, int), "num_iter must be int."
            assert num_iter > 0, "num_iter must be positive."
            self._num_iter = num_iter
            self._polar_dec = PolarBPDecoder(self._enc_polar.frozen_pos,
                                             self._n_polar,
                                             num_iter=num_iter,
                                             hard_out=True)
        else:
            raise ValueError("Unknown value for dec_type.")

        assert isinstance(return_crc_status, bool), \
                                            "return_crc_status must be bool."

        self._return_crc_status = return_crc_status
        if self._return_crc_status: # init crc decoder
            if dec_type in ("SCL", "hybSCL"):
                # re-use CRC decoder from list decoder
                self._dec_crc = self._polar_dec._crc_decoder
            else: # init new CRC decoder for BP and SC
                self._dec_crc = CRCDecoder(self._enc_polar._enc_crc)

    #########################################
    # Public methods and properties
    #########################################

    @property
    def k_target(self):
        """Number of information bits including rate-matching."""
        return self._k_target

    @property
    def n_target(self):
        """Codeword length including rate-matching."""
        return self._n_target

    @property
    def k_polar(self):
        """Number of information bits of mother Polar code."""
        return self._k_polar

    @property
    def n_polar(self):
        """Codeword length of mother Polar code."""
        return self._n_polar

    @property
    def frozen_pos(self):
        """Frozen positions for Polar decoding."""
        return self._frozen_pos

    @property
    def info_pos(self):
        """Information bit positions for Polar encoding."""
        return self._info_pos

    @property
    def llr_max(self):
        """Maximum LLR value for internal calculations."""
        return self._llr_max

    @property
    def dec_type(self):
        """Decoder type used for decoding as str."""
        return self._dec_type

    @property
    def polar_dec(self):
        """Decoder instance used for decoding."""
        return self._polar_dec

    @property
    def output_dtype(self):
        """Output dtype of decoder."""
        return self._output_dtype

    #########################
    # Utility methods
    #########################

    def _init_interleavers(self):
        """Initialize inverse interleaver patterns for rate-recovery.

        The encoder's interleavers are applied to index ranges and the
        result is inverted via ``argsort``. Sets ``ind_ch_int_inv``,
        ``ind_sub_int_inv`` and ``ind_iil_inv`` (`None` unless the input bit
        interleaver is active).
        """

        # Channel interleaver
        ind_ch_int = self._enc_polar.channel_interleaver(
                                                np.arange(self._n_target))
        self.ind_ch_int_inv = np.argsort(ind_ch_int) # Find inverse perm

        # Sub-block interleaver
        ind_sub_int = self._enc_polar.subblock_interleaving(
                                                np.arange(self._n_polar))
        self.ind_sub_int_inv = np.argsort(ind_sub_int) # Find inverse perm

        # input bit interleaver
        if self._iil:
            self.ind_iil_inv = np.argsort(self._enc_polar.input_interleaver(
                                                np.arange(self._k_polar)))
        else:
            self.ind_iil_inv = None

    #########################
    # Keras layer functions
    #########################

    def build(self, input_shape):
        """Build and check if shape of input is invalid."""
        assert (input_shape[-1]==self._n_target), "Invalid input shape."
        assert (len(input_shape)>=2), 'Inputs must have at least 2 dimensions.'

    def call(self, inputs):
        """Polar decoding and rate-recovery for 5G Polar codes.

        Undoes the channel interleaver (uplink only), the rate-matching and
        the sub-block interleaver, runs the selected Polar decoder, undoes the
        input bit interleaver (downlink only) and finally evaluates or removes
        the CRC bits.

        Args:
            inputs (tf.float32): Tensor of shape `[...,n]` containing the
                channel logits/llr values. The dtype must equal
                ``output_dtype``.

        Returns:
            `tf.float32`: Tensor of shape `[...,k]` containing
                hard-decided estimates of all ``k`` information bits, cast to
                ``output_dtype``. If ``return_crc_status`` is True, a tuple
                is returned whose second element is the CRC status of shape
                `[...]`, also cast to ``output_dtype``.

        Raises:
            An error is raised by ``tf.debugging.assert_type`` if the dtype
            of ``inputs`` is not ``output_dtype``.

            InvalidArgumentError: When rank(``inputs``)<2.
        """

        tf.debugging.assert_type(inputs, self._output_dtype,
                                 "Invalid input dtype.")
        # internal calculations still in tf.float32
        inputs = tf.cast(inputs, tf.float32)

        # Reshape inputs to [-1, n]
        tf.debugging.assert_greater(tf.rank(inputs), 1)
        input_shape = inputs.shape
        new_shape = [-1, self._n_target]
        llr_ch = tf.reshape(inputs, new_shape)

        # Note: logits are not inverted here; this is done in the decoder itself

        # 1.) Undo channel interleaving
        if self._bil:
            llr_deint = tf.gather(llr_ch, self.ind_ch_int_inv, axis=1)
        else:
            llr_deint = llr_ch

        # 2.) Remove puncturing, shortening, repetition (see Sec. 5.4.1.2)
        # a) Puncturing: set LLRs to 0
        # b) Shortening: set LLRs to infinity
        # c) Repetition: combine LLRs
        if self._n_target >= self._n_polar:
            # Repetition coding
            # Add the last n_rep positions to the first llr positions
            n_rep = self._n_target - self._n_polar
            llr_1 = llr_deint[:,:n_rep]
            llr_2 = llr_deint[:,n_rep:self._n_polar]
            llr_3 = llr_deint[:,self._n_polar:]
            llr_dematched = tf.concat([llr_1+llr_3, llr_2], 1)
        else:
            if self._k_polar/self._n_target <= 7/16:
                # Puncturing
                # Prepend n_polar - n_target "zero" llrs to first positions
                llr_zero = tf.zeros([tf.shape(llr_deint)[0],
                                     self._n_polar-self._n_target])
                llr_dematched = tf.concat([llr_zero, llr_deint], 1)
            else:
                # Shortening
                # Append n_polar - n_target "-infinity" llrs to last positions
                # Remark: we still operate with logits here, thus the neg. sign
                llr_infty = -self._llr_max * tf.ones([tf.shape(llr_deint)[0],
                                                self._n_polar-self._n_target])
                llr_dematched = tf.concat([llr_deint, llr_infty], 1)

        # 3.) Remove subblock interleaving
        llr_dec = tf.gather(llr_dematched, self.ind_sub_int_inv, axis=1)

        # 4.) Run main decoder
        u_hat_crc = self._polar_dec(llr_dec)

        # 5.) Shortening should be implicitly recovered by decoder

        # 6.) Remove input bit interleaving for downlink channels only
        if self._iil:
            u_hat_crc = tf.gather(u_hat_crc, self.ind_iil_inv, axis=1)

        # 7.) Evaluate or remove CRC (and PC)
        if self._return_crc_status:
            # for compatibility with SC/BP, a dedicated CRC decoder is
            # used here (instead of accessing the internal SCL)
            u_hat, crc_status = self._dec_crc(u_hat_crc)
        else: # just remove CRC bits
            u_hat = u_hat_crc[:,:-self._k_crc]

        # And reconstruct input shape
        output_shape = input_shape.as_list()
        output_shape[-1] = self._k_target
        output_shape[0] = -1 # First dim can be dynamic (None)
        u_hat_reshape = tf.reshape(u_hat, output_shape)
        # and cast to output dtype
        u_hat_reshape = tf.cast(u_hat_reshape, dtype=self._output_dtype)

        if self._return_crc_status:
            # reconstruct CRC shape
            output_shape.pop() # remove last dimension
            crc_status = tf.reshape(crc_status, output_shape)
            crc_status = tf.cast(crc_status, dtype=self._output_dtype)
            return u_hat_reshape, crc_status

        else:
            return u_hat_reshape