Source code for gluonnlp.model.language_model

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
"""Language models."""
__all__ = ['AWDRNN', 'StandardRNN', 'BigRNN', 'awd_lstm_lm_1150', 'awd_lstm_lm_600',
           'standard_lstm_lm_200', 'standard_lstm_lm_650', 'standard_lstm_lm_1500',
           'big_rnn_lm_2048_512']

import os

from mxnet.gluon import Block, nn, rnn, contrib
from mxnet import nd, cpu, autograd, sym
from mxnet.gluon.model_zoo import model_store

from . import train
from .utils import _load_vocab, _load_pretrained_params
from ..base import get_home_dir


class AWDRNN(train.AWDRNN):
    """AWD language model by salesforce.

    Reference: https://github.com/salesforce/awd-lstm-lm

    License: BSD 3-Clause

    Parameters
    ----------
    mode : str
        The type of RNN to use. Options are 'lstm', 'gru', 'rnn_tanh', 'rnn_relu'.
    vocab_size : int
        Size of the input vocabulary.
    embed_size : int
        Dimension of embedding vectors.
    hidden_size : int
        Number of hidden units for RNN.
    num_layers : int
        Number of RNN layers.
    tie_weights : bool, default False
        Whether to tie the weight matrices of the output dense layer and the input
        embedding layer.
    dropout : float
        Dropout rate to use for encoder output.
    weight_drop : float
        Dropout rate to use on encoder h2h weights.
    drop_h : float
        Dropout rate to use on the output of intermediate layers of the encoder.
    drop_i : float
        Dropout rate to use on the output of the embedding.
    drop_e : float
        Dropout rate to use on the embedding layer.
    """

    def __init__(self, mode, vocab_size, embed_size, hidden_size, num_layers,
                 tie_weights, dropout, weight_drop, drop_h, drop_i, drop_e, **kwargs):
        super(AWDRNN, self).__init__(mode, vocab_size, embed_size, hidden_size, num_layers,
                                     tie_weights, dropout, weight_drop, drop_h,
                                     drop_i, drop_e, **kwargs)
    def hybrid_forward(self, F, inputs, begin_state=None):  # pylint: disable=arguments-differ
        """Implement forward computation.

        Parameters
        ----------
        inputs : NDArray
            Input tensor with shape `(sequence_length, batch_size)`
            when `layout` is "TNC".
        begin_state : list
            Initial recurrent state tensors, one per layer (length equals num_layers).
            Each initial state has shape `(1, batch_size, num_hidden)`.

        Returns
        -------
        out : NDArray
            Output tensor with shape `(sequence_length, batch_size, input_size)`
            when `layout` is "TNC".
        out_states : list
            Output recurrent state tensors, one per layer (length equals num_layers).
            Each state has shape `(1, batch_size, num_hidden)`.
        """
        encoded = self.embedding(inputs)
        if not begin_state:
            if F == nd:
                begin_state = self.begin_state(batch_size=inputs.shape[1])
            else:
                begin_state = self.begin_state(batch_size=0, func=sym.zeros)
        out_states = []
        for i, (e, s) in enumerate(zip(self.encoder, begin_state)):
            encoded, state = e(encoded, s)
            out_states.append(state)
            if self._drop_h and i != len(self.encoder) - 1:
                encoded = F.Dropout(encoded, p=self._drop_h, axes=(0,))
        if self._dropout:
            encoded = F.Dropout(encoded, p=self._dropout, axes=(0,))
        with autograd.predict_mode():
            out = self.decoder(encoded)
        return out, out_states
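# Illustrative sketch (not part of the library): the hypothetical helper below shows how
# an AWDRNN might be constructed and run directly. All hyper-parameter values are made up
# and chosen only to keep the example small; the published settings live in the
# `awd_lstm_lm_*_hparams` dictionaries further down.
def _example_awd_rnn():
    import mxnet as mx

    # With tie_weights=True the last LSTM layer projects back to embed_size, so the
    # decoder can share the embedding matrix.
    model = AWDRNN(mode='lstm', vocab_size=100, embed_size=32, hidden_size=64,
                   num_layers=2, tie_weights=True, dropout=0.4, weight_drop=0.5,
                   drop_h=0.2, drop_i=0.65, drop_e=0.1)
    model.initialize(ctx=mx.cpu())

    seq_len, batch_size = 35, 4
    # Random token ids laid out as (sequence_length, batch_size), i.e. "TNC" layout.
    inputs = mx.nd.floor(mx.nd.random.uniform(0, 100, shape=(seq_len, batch_size)))
    hidden = model.begin_state(batch_size=batch_size)
    output, hidden = model(inputs, hidden)  # output: (seq_len, batch_size, vocab_size)
    return output, hidden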
class StandardRNN(train.StandardRNN):
    """Standard RNN language model.

    Parameters
    ----------
    mode : str
        The type of RNN to use. Options are 'lstm', 'gru', 'rnn_tanh', 'rnn_relu'.
    vocab_size : int
        Size of the input vocabulary.
    embed_size : int
        Dimension of embedding vectors.
    hidden_size : int
        Number of hidden units for RNN.
    num_layers : int
        Number of RNN layers.
    dropout : float
        Dropout rate to use for encoder output.
    tie_weights : bool, default False
        Whether to tie the weight matrices of the output dense layer and the input
        embedding layer.
    """

    def __init__(self, mode, vocab_size, embed_size, hidden_size,
                 num_layers, dropout, tie_weights, **kwargs):
        if tie_weights:
            assert embed_size == hidden_size, 'Embedding dimension must be equal to ' \
                                              'hidden dimension in order to tie weights. ' \
                                              'Got: emb: {}, hid: {}.'.format(embed_size,
                                                                              hidden_size)
        super(StandardRNN, self).__init__(mode, vocab_size, embed_size, hidden_size,
                                          num_layers, dropout, tie_weights, **kwargs)
    def hybrid_forward(self, F, inputs, begin_state=None):  # pylint: disable=arguments-differ
        """Defines the forward computation. Arguments can be either
        :py:class:`NDArray` or :py:class:`Symbol`.

        Parameters
        ----------
        inputs : NDArray
            Input tensor with shape `(sequence_length, batch_size)`
            when `layout` is "TNC".
        begin_state : list
            Initial recurrent state tensors, with length equal to num_layers-1.
            Each initial state has shape `(num_layers, batch_size, num_hidden)`.

        Returns
        -------
        out : NDArray
            Output tensor with shape `(sequence_length, batch_size, input_size)`
            when `layout` is "TNC".
        out_states : list
            Output recurrent state tensors, with length equal to num_layers-1.
            Each state has shape `(num_layers, batch_size, num_hidden)`.
        """
        encoded = self.embedding(inputs)
        if not begin_state:
            if F == nd:
                begin_state = self.begin_state(batch_size=inputs.shape[1])
            else:
                begin_state = self.begin_state(batch_size=0, func=sym.zeros)
        encoded, state = self.encoder(encoded, begin_state)
        if self._dropout:
            # Use F rather than nd so the block also works when hybridized to symbols.
            encoded = F.Dropout(encoded, p=self._dropout, axes=(0,))
        out = self.decoder(encoded)
        return out, state
awd_lstm_lm_1150_hparams = {
    'embed_size': 400,
    'hidden_size': 1150,
    'mode': 'lstm',
    'num_layers': 3,
    'tie_weights': True,
    'dropout': 0.4,
    'weight_drop': 0.5,
    'drop_h': 0.2,
    'drop_i': 0.65,
    'drop_e': 0.1
}

awd_lstm_lm_600_hparams = {
    'embed_size': 200,
    'hidden_size': 600,
    'mode': 'lstm',
    'num_layers': 3,
    'tie_weights': True,
    'dropout': 0.2,
    'weight_drop': 0.2,
    'drop_h': 0.1,
    'drop_i': 0.3,
    'drop_e': 0.05
}

standard_lstm_lm_200_hparams = {
    'embed_size': 200,
    'hidden_size': 200,
    'mode': 'lstm',
    'num_layers': 2,
    'tie_weights': True,
    'dropout': 0.2
}

standard_lstm_lm_650_hparams = {
    'embed_size': 650,
    'hidden_size': 650,
    'mode': 'lstm',
    'num_layers': 2,
    'tie_weights': True,
    'dropout': 0.5
}

standard_lstm_lm_1500_hparams = {
    'embed_size': 1500,
    'hidden_size': 1500,
    'mode': 'lstm',
    'num_layers': 2,
    'tie_weights': True,
    'dropout': 0.65
}

awd_lstm_lm_hparams = {
    'awd_lstm_lm_1150': awd_lstm_lm_1150_hparams,
    'awd_lstm_lm_600': awd_lstm_lm_600_hparams
}

standard_lstm_lm_hparams = {
    'standard_lstm_lm_200': standard_lstm_lm_200_hparams,
    'standard_lstm_lm_650': standard_lstm_lm_650_hparams,
    'standard_lstm_lm_1500': standard_lstm_lm_1500_hparams
}


def _get_rnn_model(model_cls, model_name, dataset_name, vocab, pretrained, ctx, root, **kwargs):
    vocab = _load_vocab(dataset_name, vocab, root)
    kwargs['vocab_size'] = len(vocab)
    net = model_cls(**kwargs)
    if pretrained:
        _load_pretrained_params(net, model_name, dataset_name, root, ctx)
    return net, vocab
def awd_lstm_lm_1150(dataset_name=None, vocab=None, pretrained=False, ctx=cpu(),
                     root=os.path.join(get_home_dir(), 'models'),
                     hparam_allow_override=False, **kwargs):
    r"""3-layer LSTM language model with weight-drop, variational dropout, and tied weights.

    Embedding size is 400, and hidden layer size is 1150.

    Parameters
    ----------
    dataset_name : str or None, default None
        The dataset name on which the pre-trained model is trained.
        Options are 'wikitext-2'. If specified, then the returned vocabulary is extracted
        from the training set of the dataset.
        If None, then `vocab` is required, to determine the embedding weight size, and is
        returned directly.
        The pre-trained model achieves 73.32/69.74 ppl on Val and Test of wikitext-2
        respectively.
    vocab : gluonnlp.Vocab or None, default None
        Vocab object to be used with the language model.
        Required when dataset_name is not specified.
    pretrained : bool, default False
        Whether to load the pre-trained weights for the model.
    ctx : Context, default CPU
        The context in which to load the pre-trained weights.
    root : str, default '$MXNET_HOME/models'
        Location for keeping the model parameters.
        MXNET_HOME defaults to '~/.mxnet'.
    hparam_allow_override : bool, default False
        If set to True, pre-defined hyper-parameters of the model
        (e.g. the number of layers, hidden units) can be overridden.

    Returns
    -------
    gluon.Block, gluonnlp.Vocab
    """
    predefined_args = awd_lstm_lm_hparams['awd_lstm_lm_1150'].copy()
    if not hparam_allow_override:
        mutable_args = frozenset(['dropout', 'weight_drop', 'drop_h', 'drop_i', 'drop_e'])
        assert all((k not in kwargs or k in mutable_args) for k in predefined_args), \
            'Cannot override predefined model settings.'
    predefined_args.update(kwargs)
    return _get_rnn_model(AWDRNN, 'awd_lstm_lm_1150', dataset_name, vocab, pretrained,
                          ctx, root, **predefined_args)
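# Illustrative sketch (hypothetical helper, not part of the module): typical ways to
# obtain this model. With `dataset_name` and `pretrained=True` the wikitext-2 vocabulary
# and weights are downloaded; with a user-supplied `gluonnlp.Vocab` only the architecture
# is created and must be initialized by the caller.
def _example_awd_lstm_lm_1150(my_vocab=None):
    import mxnet as mx

    if my_vocab is None:
        # Download the wikitext-2 vocabulary and pre-trained parameters.
        model, vocab = awd_lstm_lm_1150(dataset_name='wikitext-2', pretrained=True,
                                        ctx=mx.cpu())
    else:
        # Build an untrained model around an existing vocabulary.
        model, vocab = awd_lstm_lm_1150(vocab=my_vocab)
        model.initialize(ctx=mx.cpu())
    return model, vocab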
def awd_lstm_lm_600(dataset_name=None, vocab=None, pretrained=False, ctx=cpu(),
                    root=os.path.join(get_home_dir(), 'models'),
                    hparam_allow_override=False, **kwargs):
    r"""3-layer LSTM language model with weight-drop, variational dropout, and tied weights.

    Embedding size is 200, and hidden layer size is 600.

    Parameters
    ----------
    dataset_name : str or None, default None
        The dataset name on which the pre-trained model is trained.
        Options are 'wikitext-2'. If specified, then the returned vocabulary is extracted
        from the training set of the dataset.
        If None, then `vocab` is required, to determine the embedding weight size, and is
        returned directly.
        The pre-trained model achieves 84.61/80.96 ppl on Val and Test of wikitext-2
        respectively.
    vocab : gluonnlp.Vocab or None, default None
        Vocab object to be used with the language model.
        Required when dataset_name is not specified.
    pretrained : bool, default False
        Whether to load the pre-trained weights for the model.
    ctx : Context, default CPU
        The context in which to load the pre-trained weights.
    root : str, default '$MXNET_HOME/models'
        Location for keeping the model parameters.
        MXNET_HOME defaults to '~/.mxnet'.
    hparam_allow_override : bool, default False
        If set to True, pre-defined hyper-parameters of the model
        (e.g. the number of layers, hidden units) can be overridden.

    Returns
    -------
    gluon.Block, gluonnlp.Vocab
    """
    predefined_args = awd_lstm_lm_hparams['awd_lstm_lm_600'].copy()
    if not hparam_allow_override:
        mutable_args = frozenset(['dropout', 'weight_drop', 'drop_h', 'drop_i', 'drop_e'])
        assert all((k not in kwargs or k in mutable_args) for k in predefined_args), \
            'Cannot override predefined model settings.'
    predefined_args.update(kwargs)
    return _get_rnn_model(AWDRNN, 'awd_lstm_lm_600', dataset_name, vocab, pretrained,
                          ctx, root, **predefined_args)
def standard_lstm_lm_200(dataset_name=None, vocab=None, pretrained=False, ctx=cpu(),
                         root=os.path.join(get_home_dir(), 'models'),
                         hparam_allow_override=False, **kwargs):
    r"""Standard 2-layer LSTM language model with tied embedding and output weights.

    Both embedding and hidden dimensions are 200.

    Parameters
    ----------
    dataset_name : str or None, default None
        The dataset name on which the pre-trained model is trained.
        Options are 'wikitext-2'. If specified, then the returned vocabulary is extracted
        from the training set of the dataset.
        If None, then `vocab` is required, to determine the embedding weight size, and is
        returned directly.
        The pre-trained model achieves 108.25/102.26 ppl on Val and Test of wikitext-2
        respectively.
    vocab : gluonnlp.Vocab or None, default None
        Vocabulary object to be used with the language model.
        Required when dataset_name is not specified.
    pretrained : bool, default False
        Whether to load the pre-trained weights for the model.
    ctx : Context, default CPU
        The context in which to load the pre-trained weights.
    root : str, default '$MXNET_HOME/models'
        Location for keeping the model parameters.
        MXNET_HOME defaults to '~/.mxnet'.
    hparam_allow_override : bool, default False
        If set to True, pre-defined hyper-parameters of the model
        (e.g. the number of layers, hidden units) can be overridden.

    Returns
    -------
    gluon.Block, gluonnlp.Vocab
    """
    predefined_args = standard_lstm_lm_hparams['standard_lstm_lm_200'].copy()
    if not hparam_allow_override:
        mutable_args = frozenset(['dropout'])
        assert all((k not in kwargs or k in mutable_args) for k in predefined_args), \
            'Cannot override predefined model settings.'
    predefined_args.update(kwargs)
    return _get_rnn_model(StandardRNN, 'standard_lstm_lm_200', dataset_name, vocab,
                          pretrained, ctx, root, **predefined_args)
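# Illustrative sketch (hypothetical helper, not part of the module): `dropout` is one of
# the mutable hyper-parameters, so it can be overridden without setting
# `hparam_allow_override`; structural settings such as `num_layers` cannot. The example
# sentence below is arbitrary and only shows how the returned vocabulary maps tokens to
# the (sequence_length, batch_size) input layout.
def _example_standard_lstm_lm_200():
    import mxnet as mx

    model, vocab = standard_lstm_lm_200(dataset_name='wikitext-2', pretrained=True,
                                        dropout=0.0)  # disable dropout for evaluation
    sentence = ['the', 'quick', 'brown', 'fox']
    inputs = mx.nd.array(vocab[sentence]).reshape((len(sentence), 1))  # (T, N=1)
    hidden = model.begin_state(batch_size=1)
    output, hidden = model(inputs, hidden)  # output: (4, 1, len(vocab))
    return output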
def standard_lstm_lm_650(dataset_name=None, vocab=None, pretrained=False, ctx=cpu(),
                         root=os.path.join(get_home_dir(), 'models'),
                         hparam_allow_override=False, **kwargs):
    r"""Standard 2-layer LSTM language model with tied embedding and output weights.

    Both embedding and hidden dimensions are 650.

    Parameters
    ----------
    dataset_name : str or None, default None
        The dataset name on which the pre-trained model is trained.
        Options are 'wikitext-2'. If specified, then the returned vocabulary is extracted
        from the training set of the dataset.
        If None, then `vocab` is required, to determine the embedding weight size, and is
        returned directly.
        The pre-trained model achieves 98.96/93.90 ppl on Val and Test of wikitext-2
        respectively.
    vocab : gluonnlp.Vocab or None, default None
        Vocabulary object to be used with the language model.
        Required when dataset_name is not specified.
    pretrained : bool, default False
        Whether to load the pre-trained weights for the model.
    ctx : Context, default CPU
        The context in which to load the pre-trained weights.
    root : str, default '$MXNET_HOME/models'
        Location for keeping the model parameters.
        MXNET_HOME defaults to '~/.mxnet'.
    hparam_allow_override : bool, default False
        If set to True, pre-defined hyper-parameters of the model
        (e.g. the number of layers, hidden units) can be overridden.

    Returns
    -------
    gluon.Block, gluonnlp.Vocab
    """
    predefined_args = standard_lstm_lm_hparams['standard_lstm_lm_650'].copy()
    if not hparam_allow_override:
        mutable_args = frozenset(['dropout'])
        assert all((k not in kwargs or k in mutable_args) for k in predefined_args), \
            'Cannot override predefined model settings.'
    predefined_args.update(kwargs)
    return _get_rnn_model(StandardRNN, 'standard_lstm_lm_650', dataset_name, vocab,
                          pretrained, ctx, root, **predefined_args)
def standard_lstm_lm_1500(dataset_name=None, vocab=None, pretrained=False, ctx=cpu(),
                          root=os.path.join(get_home_dir(), 'models'),
                          hparam_allow_override=False, **kwargs):
    r"""Standard 2-layer LSTM language model with tied embedding and output weights.

    Both embedding and hidden dimensions are 1500.

    Parameters
    ----------
    dataset_name : str or None, default None
        The dataset name on which the pre-trained model is trained.
        Options are 'wikitext-2'. If specified, then the returned vocabulary is extracted
        from the training set of the dataset.
        If None, then `vocab` is required, to determine the embedding weight size, and is
        returned directly.
        The pre-trained model achieves 98.29/92.83 ppl on Val and Test of wikitext-2
        respectively.
    vocab : gluonnlp.Vocab or None, default None
        Vocabulary object to be used with the language model.
        Required when dataset_name is not specified.
    pretrained : bool, default False
        Whether to load the pre-trained weights for the model.
    ctx : Context, default CPU
        The context in which to load the pre-trained weights.
    root : str, default '$MXNET_HOME/models'
        Location for keeping the model parameters.
        MXNET_HOME defaults to '~/.mxnet'.
    hparam_allow_override : bool, default False
        If set to True, pre-defined hyper-parameters of the model
        (e.g. the number of layers, hidden units) can be overridden.

    Returns
    -------
    gluon.Block, gluonnlp.Vocab
    """
    predefined_args = standard_lstm_lm_hparams['standard_lstm_lm_1500'].copy()
    if not hparam_allow_override:
        mutable_args = frozenset(['dropout'])
        assert all((k not in kwargs or k in mutable_args) for k in predefined_args), \
            'Cannot override predefined model settings.'
    predefined_args.update(kwargs)
    return _get_rnn_model(StandardRNN, 'standard_lstm_lm_1500', dataset_name, vocab,
                          pretrained, ctx, root, **predefined_args)
model_store._model_sha1.update(
    {name: checksum for checksum, name in [
        ('a416351377d837ef12d17aae27739393f59f0b82', 'standard_lstm_lm_1500_wikitext-2'),
        ('631f39040cd65b49f5c8828a0aba65606d73a9cb', 'standard_lstm_lm_650_wikitext-2'),
        ('b233c700e80fb0846c17fe14846cb7e08db3fd51', 'standard_lstm_lm_200_wikitext-2'),
        ('f9562ed05d9bcc7e1f5b7f3c81a1988019878038', 'awd_lstm_lm_1150_wikitext-2'),
        ('e952becc7580a0b5a6030aab09d0644e9a13ce18', 'awd_lstm_lm_600_wikitext-2'),
        ('6bb3e991eb4439fabfe26c129da2fe15a324e918', 'big_rnn_lm_2048_512_gbw')
    ]})
class BigRNN(Block):
    """Big language model with LSTMP for inference.

    Parameters
    ----------
    vocab_size : int
        Size of the input vocabulary.
    embed_size : int
        Dimension of embedding vectors.
    hidden_size : int
        Number of hidden units for LSTMP.
    num_layers : int
        Number of LSTMP layers.
    projection_size : int
        Number of projection units for LSTMP.
    embed_dropout : float
        Dropout rate to use for embedding output.
    encode_dropout : float
        Dropout rate to use for encoder output.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, num_layers,
                 projection_size, embed_dropout=0.0, encode_dropout=0.0, **kwargs):
        super(BigRNN, self).__init__(**kwargs)
        self._embed_size = embed_size
        self._hidden_size = hidden_size
        self._projection_size = projection_size
        self._num_layers = num_layers
        self._embed_dropout = embed_dropout
        self._encode_dropout = encode_dropout
        self._vocab_size = vocab_size

        with self.name_scope():
            self.embedding = self._get_embedding()
            self.encoder = self._get_encoder()
            self.decoder = self._get_decoder()

    def _get_embedding(self):
        prefix = 'embedding0_'
        embedding = nn.HybridSequential(prefix=prefix)
        with embedding.name_scope():
            embedding.add(nn.Embedding(self._vocab_size, self._embed_size, prefix=prefix))
            if self._embed_dropout:
                embedding.add(nn.Dropout(self._embed_dropout))
        return embedding

    def _get_encoder(self):
        block = rnn.HybridSequentialRNNCell()
        with block.name_scope():
            for _ in range(self._num_layers):
                block.add(contrib.rnn.LSTMPCell(self._hidden_size, self._projection_size))
                if self._encode_dropout:
                    block.add(rnn.DropoutCell(self._encode_dropout))
        return block

    def _get_decoder(self):
        output = nn.Dense(self._vocab_size, prefix='decoder0_')
        return output

    def begin_state(self, **kwargs):
        return self.encoder.begin_state(**kwargs)
    def forward(self, inputs, begin_state):  # pylint: disable=arguments-differ
        """Implement forward computation.

        Parameters
        ----------
        inputs : NDArray
            Input tensor with shape `(sequence_length, batch_size)`
            when `layout` is "TNC".
        begin_state : list
            Initial recurrent state tensors, with length equal to num_layers*2.
            For each layer the two initial states have shape `(batch_size, num_hidden)`
            and `(batch_size, num_projection)`.

        Returns
        -------
        out : NDArray
            Output tensor with shape `(sequence_length, batch_size, vocab_size)`
            when `layout` is "TNC".
        out_states : list
            Output recurrent state tensors, with length equal to num_layers*2.
            For each layer the two output states have shape `(batch_size, num_hidden)`
            and `(batch_size, num_projection)`.
        """
        encoded = self.embedding(inputs)
        length = inputs.shape[0]
        batch_size = inputs.shape[1]
        encoded, state = self.encoder.unroll(length, encoded, begin_state,
                                             layout='TNC', merge_outputs=True)
        encoded = encoded.reshape((-1, self._projection_size))
        out = self.decoder(encoded)
        out = out.reshape((length, batch_size, -1))
        return out, state
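# Illustrative sketch (hypothetical helper, not part of the module): a toy BigRNN run end
# to end. All sizes are made up for brevity; the published configuration lives in
# `big_rnn_lm_2048_512_hparams` below.
def _example_big_rnn():
    import mxnet as mx

    model = BigRNN(vocab_size=100, embed_size=16, hidden_size=64, num_layers=1,
                   projection_size=16, embed_dropout=0.1, encode_dropout=0.1)
    model.initialize(ctx=mx.cpu())

    seq_len, batch_size = 10, 2
    inputs = mx.nd.floor(mx.nd.random.uniform(0, 100, shape=(seq_len, batch_size)))
    hidden = model.begin_state(batch_size=batch_size)  # num_layers*2 state tensors
    output, hidden = model(inputs, hidden)              # output: (10, 2, 100)
    return output, hidden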
big_rnn_lm_2048_512_hparams = {
    'embed_size': 512,
    'hidden_size': 2048,
    'projection_size': 512,
    'num_layers': 1,
    'embed_dropout': 0.1,
    'encode_dropout': 0.1
}

big_rnn_lm_hparams = {
    'big_rnn_lm_2048_512': big_rnn_lm_2048_512_hparams
}
def big_rnn_lm_2048_512(dataset_name=None, vocab=None, pretrained=False, ctx=cpu(),
                        root=os.path.join(get_home_dir(), 'models'),
                        hparam_allow_override=False, **kwargs):
    r"""Big 1-layer LSTMP language model.

    Both embedding and projection size are 512. Hidden size is 2048.

    Parameters
    ----------
    dataset_name : str or None, default None
        The dataset name on which the pre-trained model is trained.
        Options are 'gbw'. If specified, then the returned vocabulary is extracted from
        the training set of the dataset.
        If None, then `vocab` is required, to determine the embedding weight size, and is
        returned directly.
        The pre-trained model achieves 44.05 ppl on the Test set of the GBW dataset.
    vocab : gluonnlp.Vocab or None, default None
        Vocabulary object to be used with the language model.
        Required when dataset_name is not specified.
    pretrained : bool, default False
        Whether to load the pre-trained weights for the model.
    ctx : Context, default CPU
        The context in which to load the pre-trained weights.
    root : str, default '$MXNET_HOME/models'
        Location for keeping the model parameters.
        MXNET_HOME defaults to '~/.mxnet'.
    hparam_allow_override : bool, default False
        If set to True, pre-defined hyper-parameters of the model
        (e.g. the number of layers, hidden units) can be overridden.

    Returns
    -------
    gluon.Block, gluonnlp.Vocab
    """
    predefined_args = big_rnn_lm_hparams['big_rnn_lm_2048_512'].copy()
    if not hparam_allow_override:
        mutable_args = frozenset(['embed_dropout', 'encode_dropout'])
        assert all((k not in kwargs or k in mutable_args) for k in predefined_args), \
            'Cannot override predefined model settings.'
    predefined_args.update(kwargs)
    return _get_rnn_model(BigRNN, 'big_rnn_lm_2048_512', dataset_name, vocab, pretrained,
                          ctx, root, **predefined_args)
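# Illustrative sketch (hypothetical helper, not part of the module): building a
# scaled-down variant of this architecture around a user-supplied vocabulary. Structural
# hyper-parameters such as `hidden_size` can only be overridden when
# `hparam_allow_override=True`; the sizes chosen here are arbitrary.
def _example_big_rnn_lm(my_vocab):
    model, vocab = big_rnn_lm_2048_512(vocab=my_vocab, hparam_allow_override=True,
                                       hidden_size=256, projection_size=64)
    model.initialize()  # no pre-trained weights exist for the modified shape
    return model, vocab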