Source code for gluonnlp.model.elmo

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

"""ELMo."""
__all__ = ['ELMoBiLM', 'ELMoCharacterEncoder',
           'elmo_2x1024_128_2048cnn_1xhighway', 'elmo_2x2048_256_2048cnn_1xhighway',
           'elmo_2x4096_512_2048cnn_2xhighway']

import os
import mxnet as mx
from mxnet import gluon
from mxnet.gluon.model_zoo import model_store
from mxnet.gluon.model_zoo.model_store import get_model_file

from .convolutional_encoder import ConvolutionalEncoder
from .bilm_encoder import BiLMEncoder
from ..initializer.initializer import HighwayBias
from ..vocab.elmo import ELMoCharVocab
from ..base import get_home_dir


class ELMoCharacterEncoder(gluon.HybridBlock):
    r"""ELMo character encoder

    Compute context-free character-based token representation with character-level convolution.

    This encoder has input character ids of shape
    (batch_size, sequence_length, max_character_per_word)
    and returns (batch_size, sequence_length, embedding_size).

    Parameters
    ----------
    output_size : int
        The output dimension after conducting the convolutions and max pooling,
        and applying highways, as well as linear projection.
    filters : list of tuple
        List of tuples representing the settings for convolution layers.
        Each element is (ngram_filter_size, num_filters).
    char_embed_size : int
        The input dimension to the encoder.
    num_highway : int
        The number of layers of the Highway layer.
    conv_layer_activation : str
        Activation function to be used after convolutional layer.
    max_chars_per_token : int
        The maximum number of characters of a token.
    char_vocab_size : int
        Size of character-level vocabulary.
    """
    def __init__(self,
                 output_size,
                 filters,
                 char_embed_size,
                 num_highway,
                 conv_layer_activation,
                 max_chars_per_token,
                 char_vocab_size,
                 **kwargs):
        super(ELMoCharacterEncoder, self).__init__(**kwargs)

        self._output_size = output_size
        self._char_embed_size = char_embed_size
        self._filters = filters
        ngram_filter_sizes = []
        num_filters = []
        for width, num in filters:
            ngram_filter_sizes.append(width)
            num_filters.append(num)
        self._num_highway = num_highway
        self._conv_layer_activation = conv_layer_activation
        self._max_chars_per_token = max_chars_per_token

        with self.name_scope():
            self._char_embedding = gluon.nn.Embedding(char_vocab_size, self._char_embed_size)
            self._convolutions = ConvolutionalEncoder(
                embed_size=self._char_embed_size,
                num_filters=tuple(num_filters),
                ngram_filter_sizes=tuple(ngram_filter_sizes),
                conv_layer_activation=conv_layer_activation,
                num_highway=self._num_highway,
                highway_bias=HighwayBias(nonlinear_transform_bias=0.0,
                                         transform_gate_bias=1.0),
                output_size=self._output_size)
    def hybrid_forward(self, F, inputs):  # pylint: disable=arguments-differ
        """
        Compute context insensitive token embeddings for ELMo representations.

        Parameters
        ----------
        inputs : NDArray
            Shape (batch_size, sequence_length, max_character_per_token)
            of character ids representing the current batch.

        Returns
        -------
        token_embedding : NDArray
            Shape (batch_size, sequence_length, embedding_size) with context
            insensitive token representations.
        """
        # the character id embedding
        # (batch_size * sequence_length, max_chars_per_token, embed_dim)
        character_embedding = self._char_embedding(inputs.reshape((-1, self._max_chars_per_token)))

        character_embedding = F.transpose(character_embedding, axes=(1, 0, 2))
        token_embedding = self._convolutions(character_embedding)

        out_shape_ref = inputs.slice_axis(axis=-1, begin=0, end=1)
        out_shape_ref = out_shape_ref.broadcast_axes(axis=(2,), size=(self._output_size))

        return token_embedding.reshape_like(out_shape_ref)
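# --- Standalone usage sketch (illustration only, not part of this module) ---
# A minimal sketch of running ELMoCharacterEncoder on its own. The tiny filter and
# embedding sizes are arbitrary assumptions for demonstration; pre-trained models use the
# settings listed in the constructors at the bottom of this file. char_vocab_size is taken
# from ELMoCharVocab, mirroring what _get_elmo_model does below.
import mxnet as mx
from gluonnlp.model.elmo import ELMoCharacterEncoder
from gluonnlp.vocab.elmo import ELMoCharVocab

char_vocab = ELMoCharVocab()
char_encoder = ELMoCharacterEncoder(output_size=16,
                                    filters=[[1, 4], [2, 8]],
                                    char_embed_size=8,
                                    num_highway=1,
                                    conv_layer_activation='relu',
                                    max_chars_per_token=50,
                                    char_vocab_size=len(char_vocab))
char_encoder.initialize()
# dummy character ids: (batch_size=2, sequence_length=3, max_chars_per_token=50)
char_ids = mx.nd.zeros((2, 3, 50))
token_embedding = char_encoder(char_ids)  # context-free token embeddings, shape (2, 3, 16)
# --- end of usage sketch ---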
class ELMoBiLM(gluon.HybridBlock):
    r"""ELMo Bidirectional language model

    Run a pre-trained bidirectional language model, outputting the weighted ELMo representation.

    We implement the ELMo Bidirectional language model (BiLM) proposed in the following work::

        @inproceedings{Peters:2018,
        author={Peters, Matthew E. and Neumann, Mark and Iyyer, Mohit and Gardner, Matt
        and Clark, Christopher and Lee, Kenton and Zettlemoyer, Luke},
        title={Deep contextualized word representations},
        booktitle={Proc. of NAACL},
        year={2018}
        }

    Parameters
    ----------
    rnn_type : str
        The type of RNN cell to use.
        The option for pre-trained models is 'lstmpc'.
    output_size : int
        The output dimension after conducting the convolutions and max pooling,
        and applying highways, as well as linear projection.
    filters : list of tuple
        List of tuples representing the settings for convolution layers.
        Each element is (ngram_filter_size, num_filters).
    char_embed_size : int
        The input dimension to the encoder.
    char_vocab_size : int
        Size of character-level vocabulary.
    num_highway : int
        The number of layers of the Highway layer.
    conv_layer_activation : str
        Activation function to be used after convolutional layer.
    max_chars_per_token : int
        The maximum number of characters of a token.
    input_size : int
        The initial input size of the RNN cell.
    hidden_size : int
        The hidden size of the RNN cell.
    proj_size : int
        The projection size of each LSTMPCellWithClip cell.
    num_layers : int
        The number of RNN cells.
    cell_clip : float
        Clip cell state between [-cell_clip, cell_clip] in LSTMPCellWithClip cell.
    proj_clip : float
        Clip projection between [-proj_clip, proj_clip] in LSTMPCellWithClip cell.
    skip_connection : bool
        Whether to add skip connections (add RNN cell input to output).
    """
    def __init__(self,
                 rnn_type,
                 output_size,
                 filters,
                 char_embed_size,
                 char_vocab_size,
                 num_highway,
                 conv_layer_activation,
                 max_chars_per_token,
                 input_size,
                 hidden_size,
                 proj_size,
                 num_layers,
                 cell_clip,
                 proj_clip,
                 skip_connection=True,
                 **kwargs):
        super(ELMoBiLM, self).__init__(**kwargs)

        self._rnn_type = rnn_type
        self._output_size = output_size
        self._filters = filters
        self._char_embed_size = char_embed_size
        self._char_vocab_size = char_vocab_size
        self._num_highway = num_highway
        self._conv_layer_activation = conv_layer_activation
        self._max_chars_per_token = max_chars_per_token
        self._input_size = input_size
        self._hidden_size = hidden_size
        self._proj_size = proj_size
        self._num_layers = num_layers
        self._cell_clip = cell_clip
        self._proj_clip = proj_clip
        self._skip_connection = skip_connection

        if not self._skip_connection:
            raise NotImplementedError

        with self.name_scope():
            self._elmo_char_encoder = ELMoCharacterEncoder(self._output_size,
                                                           self._filters,
                                                           self._char_embed_size,
                                                           self._num_highway,
                                                           self._conv_layer_activation,
                                                           self._max_chars_per_token,
                                                           self._char_vocab_size)
            self._elmo_lstm = BiLMEncoder(mode=self._rnn_type,
                                          input_size=self._input_size,
                                          hidden_size=self._hidden_size,
                                          proj_size=self._proj_size,
                                          num_layers=self._num_layers,
                                          cell_clip=self._cell_clip,
                                          proj_clip=self._proj_clip)

    def begin_state(self, func, **kwargs):
        return self._elmo_lstm.begin_state(func, **kwargs)
    def hybrid_forward(self, F, inputs, states=None, mask=None):  # pylint: disable=arguments-differ
        """
        Parameters
        ----------
        inputs : NDArray
            Shape (batch_size, sequence_length, max_character_per_token)
            of character ids representing the current batch.
        states : (list of list of NDArray, list of list of NDArray)
            The states. First tuple element is the forward layer states, while the second is
            the states from backward layer. Each is a list of states for each layer.
            The state of each layer has a list of two initial tensors with
            shape (batch_size, proj_size) and (batch_size, hidden_size).
        mask : NDArray
            Shape (batch_size, sequence_length) with sequence mask.

        Returns
        -------
        output : list of NDArray
            A list of activations at each layer of the network, each of shape
            (batch_size, sequence_length, embedding_size).
        states : (list of list of NDArray, list of list of NDArray)
            The states. First tuple element is the forward layer states, while the second is
            the states from backward layer. Each is a list of states for each layer.
            The state of each layer has a list of two initial tensors with
            shape (batch_size, proj_size) and (batch_size, hidden_size).
        """
        type_representation = self._elmo_char_encoder(inputs)
        type_representation = type_representation.transpose(axes=(1, 0, 2))
        lstm_outputs, states = self._elmo_lstm(type_representation, states, mask)
        lstm_outputs = lstm_outputs.transpose(axes=(0, 2, 1, 3))
        type_representation = type_representation.transpose(axes=(1, 0, 2))

        # Prepare the output. The first layer is duplicated.
        output = F.concat(*[type_representation, type_representation], dim=-1)
        if mask is not None:
            output = output * mask.expand_dims(axis=-1)
        output = [output]
        output.extend([layer_activations.squeeze(axis=0)
                       for layer_activations
                       in F.split(lstm_outputs, self._num_layers, axis=0)])
        return output, states
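# --- Standalone usage sketch (illustration only, not part of this module) ---
# A small ELMoBiLM wired up end to end, assuming tiny illustrative sizes. Note that
# input_size and proj_size are chosen to match output_size of the character encoder,
# as in the pre-trained configurations below; the begin_state/forward call pattern
# follows the signatures defined in this class.
import mxnet as mx
from gluonnlp.model.elmo import ELMoBiLM
from gluonnlp.vocab.elmo import ELMoCharVocab

char_vocab = ELMoCharVocab()
bilm = ELMoBiLM(rnn_type='lstmpc', output_size=16,
                filters=[[1, 4], [2, 8]], char_embed_size=8,
                char_vocab_size=len(char_vocab), num_highway=1,
                conv_layer_activation='relu', max_chars_per_token=50,
                input_size=16, hidden_size=32, proj_size=16,
                num_layers=2, cell_clip=3, proj_clip=3)
bilm.initialize()
batch_size, seq_len = 2, 3
char_ids = mx.nd.zeros((batch_size, seq_len, 50))
mask = mx.nd.ones((batch_size, seq_len))
states = bilm.begin_state(mx.nd.zeros, batch_size=batch_size)
outputs, states = bilm(char_ids, states, mask)
# outputs holds num_layers + 1 = 3 activations, each of shape (2, 3, 2 * proj_size)
# --- end of usage sketch ---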
model_store._model_sha1.update(
    {name: checksum for checksum, name in [
        ('8c9257d9153436e9eb692f9ec48d8ee07e2120f8', 'elmo_2x1024_128_2048cnn_1xhighway_gbw'),
        ('85eab56a3c90c6866dd8d13b50449934be58a2e6', 'elmo_2x2048_256_2048cnn_1xhighway_gbw'),
        ('79af623840c13b10cb891d20c207afc483ab27b9', 'elmo_2x4096_512_2048cnn_2xhighway_5bw'),
        ('5608a09f33c52e5ab3f043b1793481ab448a0347', 'elmo_2x4096_512_2048cnn_2xhighway_gbw')
    ]})


def _get_elmo_model(model_cls, model_name, dataset_name, pretrained, ctx, root, **kwargs):
    vocab = ELMoCharVocab()
    if 'char_vocab_size' not in kwargs:
        kwargs['char_vocab_size'] = len(vocab)
    net = model_cls(**kwargs)
    if pretrained:
        model_file = get_model_file('_'.join([model_name, dataset_name]), root=root)
        net.load_parameters(model_file, ctx=ctx)
    return net, vocab
def elmo_2x1024_128_2048cnn_1xhighway(dataset_name=None, pretrained=False, ctx=mx.cpu(),
                                      root=os.path.join(get_home_dir(), 'models'), **kwargs):
    r"""ELMo 2-layer BiLSTM with 1024 hidden units, 128 projection size, 1 highway layer.

    Parameters
    ----------
    dataset_name : str or None, default None
        The dataset name on which the pre-trained model is trained.
        Options are 'gbw'.
    pretrained : bool, default False
        Whether to load the pre-trained weights for the model.
    ctx : Context, default CPU
        The context in which to load the pre-trained weights.
    root : str, default '$MXNET_HOME/models'
        Location for keeping the model parameters.
        MXNET_HOME defaults to '~/.mxnet'.

    Returns
    -------
    gluon.Block, ELMoCharVocab
    """
    predefined_args = {'rnn_type': 'lstmpc',
                       'output_size': 128,
                       'filters': [[1, 32], [2, 32], [3, 64], [4, 128],
                                   [5, 256], [6, 512], [7, 1024]],
                       'char_embed_size': 16,
                       'num_highway': 1,
                       'conv_layer_activation': 'relu',
                       'max_chars_per_token': 50,
                       'input_size': 128,
                       'hidden_size': 1024,
                       'proj_size': 128,
                       'num_layers': 2,
                       'cell_clip': 3,
                       'proj_clip': 3,
                       'skip_connection': True}
    assert all((k not in kwargs) for k in predefined_args), \
        'Cannot override predefined model settings.'
    predefined_args.update(kwargs)
    return _get_elmo_model(ELMoBiLM, 'elmo_2x1024_128_2048cnn_1xhighway', dataset_name,
                           pretrained, ctx, root, **predefined_args)
def elmo_2x2048_256_2048cnn_1xhighway(dataset_name=None, pretrained=False, ctx=mx.cpu(),
                                      root=os.path.join(get_home_dir(), 'models'), **kwargs):
    r"""ELMo 2-layer BiLSTM with 2048 hidden units, 256 projection size, 1 highway layer.

    Parameters
    ----------
    dataset_name : str or None, default None
        The dataset name on which the pre-trained model is trained.
        Options are 'gbw'.
    pretrained : bool, default False
        Whether to load the pre-trained weights for the model.
    ctx : Context, default CPU
        The context in which to load the pre-trained weights.
    root : str, default '$MXNET_HOME/models'
        Location for keeping the model parameters.
        MXNET_HOME defaults to '~/.mxnet'.

    Returns
    -------
    gluon.Block, ELMoCharVocab
    """
    predefined_args = {'rnn_type': 'lstmpc',
                       'output_size': 256,
                       'filters': [[1, 32], [2, 32], [3, 64], [4, 128],
                                   [5, 256], [6, 512], [7, 1024]],
                       'char_embed_size': 16,
                       'num_highway': 1,
                       'conv_layer_activation': 'relu',
                       'max_chars_per_token': 50,
                       'input_size': 256,
                       'hidden_size': 2048,
                       'proj_size': 256,
                       'num_layers': 2,
                       'cell_clip': 3,
                       'proj_clip': 3,
                       'skip_connection': True}
    assert all((k not in kwargs) for k in predefined_args), \
        'Cannot override predefined model settings.'
    predefined_args.update(kwargs)
    return _get_elmo_model(ELMoBiLM, 'elmo_2x2048_256_2048cnn_1xhighway', dataset_name,
                           pretrained, ctx, root, **predefined_args)
def elmo_2x4096_512_2048cnn_2xhighway(dataset_name=None, pretrained=False, ctx=mx.cpu(),
                                      root=os.path.join(get_home_dir(), 'models'), **kwargs):
    r"""ELMo 2-layer BiLSTM with 4096 hidden units, 512 projection size, 2 highway layers.

    Parameters
    ----------
    dataset_name : str or None, default None
        The dataset name on which the pre-trained model is trained.
        Options are 'gbw' and '5bw'.
    pretrained : bool, default False
        Whether to load the pre-trained weights for the model.
    ctx : Context, default CPU
        The context in which to load the pre-trained weights.
    root : str, default '$MXNET_HOME/models'
        Location for keeping the model parameters.
        MXNET_HOME defaults to '~/.mxnet'.

    Returns
    -------
    gluon.Block, ELMoCharVocab
    """
    predefined_args = {'rnn_type': 'lstmpc',
                       'output_size': 512,
                       'filters': [[1, 32], [2, 32], [3, 64], [4, 128],
                                   [5, 256], [6, 512], [7, 1024]],
                       'char_embed_size': 16,
                       'num_highway': 2,
                       'conv_layer_activation': 'relu',
                       'max_chars_per_token': 50,
                       'input_size': 512,
                       'hidden_size': 4096,
                       'proj_size': 512,
                       'num_layers': 2,
                       'cell_clip': 3,
                       'proj_clip': 3,
                       'skip_connection': True}
    assert all((k not in kwargs) for k in predefined_args), \
        'Cannot override predefined model settings.'
    predefined_args.update(kwargs)
    return _get_elmo_model(ELMoBiLM, 'elmo_2x4096_512_2048cnn_2xhighway', dataset_name,
                           pretrained, ctx, root, **predefined_args)
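# --- Standalone usage sketch (illustration only, not part of this module) ---
# Building the smallest pre-trained architecture and encoding one sentence. Here
# pretrained=False keeps the weights randomly initialized; set pretrained=True to
# download the 'gbw' parameters instead. The example sentence is arbitrary.
import mxnet as mx
from gluonnlp.model.elmo import elmo_2x1024_128_2048cnn_1xhighway

elmo, vocab = elmo_2x1024_128_2048cnn_1xhighway(dataset_name='gbw', pretrained=False)
elmo.initialize()

tokens = ['Hello', 'World', '!']
# ELMoCharVocab maps each token to max_chars_per_token=50 character ids
char_ids = mx.nd.array(vocab[tokens]).expand_dims(axis=0)  # (1, 3, 50)
mask = mx.nd.ones((1, len(tokens)))
states = elmo.begin_state(mx.nd.zeros, batch_size=1)
outputs, states = elmo(char_ids, states, mask)
# outputs: num_layers + 1 = 3 activations, each of shape (1, 3, 2 * proj_size) = (1, 3, 256)
# --- end of usage sketch ---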