# Source code for gluonnlp.model.highway

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

"""Highway layer."""


__all__ = ['Highway']

from mxnet import gluon
from mxnet.gluon import nn
from ..initializer import HighwayBias


class Highway(gluon.HybridBlock):
    r"""Highway network.

    We implemented the highway network proposed in the following work::

        @article{srivastava2015highway,
          title={Highway networks},
          author={Srivastava, Rupesh Kumar and Greff, Klaus and Schmidhuber, J{\"u}rgen},
          journal={arXiv preprint arXiv:1505.00387},
          year={2015}
        }

    The full version of the work::

        @inproceedings{srivastava2015training,
         title={Training very deep networks},
         author={Srivastava, Rupesh K and Greff, Klaus and Schmidhuber, J{\"u}rgen},
         booktitle={Advances in neural information processing systems},
         pages={2377--2385},
         year={2015}
        }

    A Highway layer is defined as below:

    .. math::
        y = (1 - t) * x + t * f(A(x))

    which is a gated combination of the unchanged input and a non-linear transform of it,
    where :math:`x` is the input tensor, :math:`A` is a linear transformer,
    :math:`f` is an element-wise non-linear transformer,
    :math:`t` is an element-wise transform gate, and :math:`1-t` is the carry gate.

    In this implementation each layer owns a single dense projection of width
    ``2 * input_size``. Its output is split in half along the last axis: the first half
    is :math:`A(x)` and the sigmoid of the second half is the transform gate :math:`t`.

    Parameters
    ----------
    input_size : int
        The size of the last dimension of the input tensor. The input is expected to
        have shape ``(..., input_size)``; the output has the same shape.
    num_layers : int
        The number of highway layers to apply to the input.
    activation : str, default 'relu'
        Name of the non-linear activation :math:`f`, forwarded to
        ``mxnet.gluon.nn.Activation``. One activation block is shared by all layers.
    highway_bias : HighwayBias,
        default HighwayBias(nonlinear_transform_bias=0.0, transform_gate_bias=-2.0)
        The bias initializer used for the projection of every highway layer.
        We set the default according to the above original work.
    """

    def __init__(self,
                 input_size,
                 num_layers,
                 activation='relu',
                 highway_bias=HighwayBias(nonlinear_transform_bias=0.0, transform_gate_bias=-2.0),
                 **kwargs):
        super(Highway, self).__init__(**kwargs)
        self._input_size = input_size
        self._num_layers = num_layers

        with self.name_scope():
            self.hnet = nn.HybridSequential()
            with self.hnet.name_scope():
                for _ in range(self._num_layers):
                    # One projection yields both halves: the candidate for the
                    # non-linear transform and the pre-sigmoid transform gate.
                    # flatten=False keeps all leading dimensions untouched.
                    # The same ``highway_bias`` object is handed to every layer.
                    self.hnet.add(nn.Dense(units=self._input_size * 2,
                                           in_units=self._input_size,
                                           bias_initializer=highway_bias,
                                           use_bias=True,
                                           flatten=False))
            self._activation = nn.Activation(activation)

    def hybrid_forward(self, F, inputs, **kwargs): # pylint: disable=arguments-differ
        r"""
        Forward computation for highway layer

        Parameters
        ----------
        F : module
            The ``mxnet.ndarray`` or ``mxnet.symbol`` namespace supplied by
            ``HybridBlock``. It is not used directly; the operators are invoked
            as methods on the tensors.
        inputs: NDArray
            The input tensor is of shape `(..., input_size)`.
        **kwargs
            Accepted for interface compatibility and ignored.

        Returns
        -------
        outputs: NDArray
            The output tensor is of the same shape with input tensor `(..., input_size)`.
        """
        current_input = inputs
        for layer in self.hnet:
            # x in the formula: passed through unchanged, weighted by the carry gate.
            carried = current_input
            projected = layer(current_input)
            # Split the last axis (size 2 * input_size) into A(x) and the gate logits.
            transformed, transform_gate = projected.split(num_outputs=2, axis=-1)
            transformed = self._activation(transformed)
            transform_gate = transform_gate.sigmoid()
            # y = (1 - t) * x + t * f(A(x)); y feeds the next layer.
            current_input = ((1 - transform_gate) * carried
                             + transform_gate * transformed)
        return current_input