# Source code for gluonnlp.model.train

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

# pylint: disable=wildcard-import
"""NLP training model."""

import mxnet as mx
import gluonnlp as nlp

from . import cache, embedding, language_model
from .cache import *
from .embedding import *
from .language_model import *

# Public API of this package: the concatenated ``__all__`` lists of the
# language_model, cache and embedding submodules (whose names are star-imported
# above), plus the ``get_cache_model`` factory defined in this file.
__all__ = language_model.__all__ + cache.__all__ + embedding.__all__ + ['get_cache_model']


def get_cache_model(name, dataset_name='wikitext-2', window=2000,
                    theta=0.6, lambdas=0.2, ctx=mx.cpu(), **kwargs):
    r"""Returns a cache model built on top of a language model.

    We implement the neural cache language model proposed in the following work::

        @article{grave2016improving,
        title={Improving neural language models with a continuous cache},
        author={Grave, Edouard and Joulin, Armand and Usunier, Nicolas},
        journal={ICLR},
        year={2017}
        }

    The language model and its vocabulary are obtained from
    ``nlp.model.get_model`` and then wrapped in a ``CacheCell``.

    Parameters
    ----------
    name : str
        Name of the language model to wrap; passed to ``nlp.model.get_model``.
    dataset_name : str or None, default 'wikitext-2'.
        The dataset name on which the pre-trained model is trained.
        Options are 'wikitext-2'. If specified, then the returned vocabulary is extracted from
        the training set of the dataset.
        If None, then vocab is required, for specifying embedding weight size, and is directly
        returned.
    window : int, default 2000
        Size of the cache window.
    theta : float, default 0.6
        Scalar that controls the flatness of the cache distribution
        used to predict the next word, as shown below:

        .. math::

            p_{cache} \propto \sum_{i=1}^{t-1} \mathbb{1}_{w=x_{i+1}} \exp(\theta {h_t}^T h_i)

        where :math:`p_{cache}` is the cache distribution, :math:`\mathbb{1}` is
        the indicator function, and :math:`h_i` is the output of timestep i.
    lambdas : float, default 0.2
        Interpolation weight between the vocabulary distribution and the cache
        distribution; the formulation is as below:

        .. math::

            p = (1 - \lambda) p_{vocab} + \lambda p_{cache}

        where :math:`p_{vocab}` is the vocabulary distribution and :math:`p_{cache}`
        is the cache distribution.
    ctx : Context, default CPU
        The context in which to load the pre-trained weights.
    **kwargs
        Extra keyword arguments forwarded unchanged to ``nlp.model.get_model``.
        The ones documented for this function are:

        vocab : gluonnlp.Vocab or None, default None
            Vocabulary object to be used with the language model.
            Required when dataset_name is not specified.
        pretrained : bool, default True
            Whether to load the pre-trained weights for model.
        root : str, default '~/.mxnet/models'
            Location for keeping the pre-trained model parameters.

    Returns
    -------
    CacheCell
        The cache model wrapping the language model.
    """
    # ``pretrained`` used to be hard-coded in the call below, so a caller who passed
    # it (as the docstring invites) hit "TypeError: got multiple values for keyword
    # argument 'pretrained'". Keep True as the default but honour an explicit value.
    kwargs.setdefault('pretrained', True)
    lm_model, vocab = nlp.model.get_model(name, dataset_name=dataset_name,
                                          ctx=ctx, **kwargs)
    # The vocabulary size is handed to CacheCell together with the cache
    # hyper-parameters (window, theta, lambdas).
    return CacheCell(lm_model, len(vocab), window, theta, lambdas)