Source code for gluonnlp.embedding.token_embedding

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

# pylint: disable=consider-iterating-dictionary, too-many-lines
"""Text token embedding."""

__all__ = [
    'register', 'create', 'list_sources', 'TokenEmbedding', 'GloVe',
    'FastText', 'Word2Vec'
]

import io
import logging
import os
import warnings

import numpy as np
from mxnet import nd, registry, cpu
from mxnet.gluon.utils import download, check_sha1, _get_repo_file_url

from .. import _constants as C
from ..base import get_home_dir
from ..data.utils import DefaultLookupDict
from ..model.train import FasttextEmbeddingModel

UNK_IDX = 0
ENCODING = 'utf8'
INIT_UNKNOWN_VEC = nd.zeros

def register(embedding_cls):
    """Registers a new token embedding.

    Once an embedding is registered, we can create an instance of this
    embedding with :func:`~gluonnlp.embedding.create`.


    Examples
    --------
    >>> @gluonnlp.embedding.register
    ... class MyTextEmbed(gluonnlp.embedding.TokenEmbedding):
    ...     def __init__(self, source='my_pretrain_file'):
    ...         pass
    >>> embed = gluonnlp.embedding.create('MyTextEmbed')
    >>> print(type(embed))
    <class 'gluonnlp.embedding.token_embedding.MyTextEmbed'>
    """
    register_text_embedding = registry.get_register_func(TokenEmbedding, 'token embedding')

    return register_text_embedding(embedding_cls)
def create(embedding_name, **kwargs):
    """Creates an instance of token embedding.

    Creates a token embedding instance by loading embedding vectors from an
    externally hosted pre-trained token embedding file, such as those of GloVe
    and FastText. To get all the valid `embedding_name` and `source`, use
    :func:`gluonnlp.embedding.list_sources`.


    Parameters
    ----------
    embedding_name : str
        The token embedding name (case-insensitive).
    kwargs : dict
        All other keyword arguments are passed to the initializer of token
        embedding class. For example
        `create(embedding_name='fasttext', source='wiki.simple', load_ngrams=True)`
        will return `FastText(source='wiki.simple', load_ngrams=True)`.


    Returns
    -------
    An instance of :class:`gluonnlp.embedding.TokenEmbedding`:
        A token embedding instance that loads embedding vectors from an
        externally hosted pre-trained token embedding file.
    """
    create_text_embedding = registry.get_create_func(TokenEmbedding, 'token embedding')

    return create_text_embedding(embedding_name, **kwargs)
def list_sources(embedding_name=None):
    """Get valid token embedding names and their pre-trained file names.

    To load token embedding vectors from an externally hosted pre-trained
    token embedding file, such as those of GloVe and FastText, one should use
    `gluonnlp.embedding.create(embedding_name, source)`. This method returns
    all the valid names of `source` for the specified `embedding_name`. If
    `embedding_name` is set to None, this method returns all the valid names
    of `embedding_name` with their associated `source`.


    Parameters
    ----------
    embedding_name : str or None, default None
        The pre-trained token embedding name.


    Returns
    -------
    dict or list:
        A list of all the valid pre-trained token embedding file names
        (`source`) for the specified token embedding name (`embedding_name`).
        If the text embedding name is set to None, returns a dict mapping each
        valid token embedding name to a list of valid pre-trained files
        (`source`). They can be plugged into
        `gluonnlp.embedding.create(embedding_name, source)`.
    """
    text_embedding_reg = registry.get_registry(TokenEmbedding)

    if embedding_name is not None:
        embedding_name = embedding_name.lower()
        if embedding_name not in text_embedding_reg:
            raise KeyError('Cannot find `embedding_name` {}. Use '
                           '`list_sources(embedding_name=None).keys()` to get all the valid '
                           'embedding names.'.format(embedding_name))
        return list(text_embedding_reg[embedding_name].source_file_hash.keys())
    else:
        return {embedding_name: list(embedding_cls.source_file_hash.keys())
                for embedding_name, embedding_cls in registry.get_registry(TokenEmbedding).items()}
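# Illustrative usage sketch (not part of the original source): `list_sources`
# and `create` above are the public entry points of this module. The source
# name 'glove.6B.50d' is taken from the GloVe source list further below.
#
# >>> import gluonnlp as nlp
# >>> sorted(nlp.embedding.list_sources().keys())      # the registered embedding names
# >>> nlp.embedding.list_sources('glove')              # valid `source` names for GloVe
# >>> glove = nlp.embedding.create('glove', source='glove.6B.50d')
# >>> glove['hello'].shape                             # (50,)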
class TokenEmbedding:
    """Token embedding base class.

    To load token embedding from an externally hosted pre-trained token embedding file, such as
    those of GloVe and FastText, use :func:`gluonnlp.embedding.create`.
    To get all the available `embedding_name` and `source`, use
    :func:`gluonnlp.embedding.list_sources`.

    Alternatively, to load embedding vectors from a custom pre-trained token embedding file, use
    :func:`gluonnlp.embedding.TokenEmbedding.from_file`.

    If `unknown_token` is None, looking up unknown tokens results in KeyError.
    Otherwise, for every unknown token, if its representation `self.unknown_token` is encountered
    in the pre-trained token embedding file, index 0 of `self.idx_to_vec` maps to the pre-trained
    token embedding vector loaded from the file; otherwise, index 0 of `self.idx_to_vec` maps to
    the token embedding vector initialized by `init_unknown_vec`.

    If a token is encountered multiple times in the pre-trained token embedding file, only the
    first-encountered token embedding vector will be loaded and the rest will be skipped.


    Parameters
    ----------
    unknown_token : hashable object or None, default '<unk>'
        Any unknown token will be replaced by unknown_token and consequently will be indexed as
        the same representation. Only used if `unknown_lookup` is not specified.
    init_unknown_vec : callback, default nd.zeros
        The callback used to initialize the embedding vector for the unknown token. Only used if
        `unknown_token` is not None and `idx_to_token` is not None and does not contain
        `unknown_token`.
    allow_extend : bool, default False
        If True, embedding vectors for previously unknown words can be added via
        token_embedding[tokens] = vecs. If False, only vectors for known tokens can be updated.
    unknown_lookup : object subscriptable with list of tokens returning nd.NDarray, default None
        If not None, the TokenEmbedding obtains embeddings for unknown tokens automatically from
        `unknown_lookup[unknown_tokens]`. For example, in a FastText model, embeddings for
        unknown tokens can be computed from the subword information.
    idx_to_token : list of str or None, default None
        If not None, a list of tokens for which the `idx_to_vec` argument provides embeddings.
        The list indices and the indices of `idx_to_vec` must be aligned.
        If `idx_to_token` is not None, `idx_to_vec` must not be None either.
        If `idx_to_token` is None, an empty TokenEmbedding object is created.
        If `allow_extend` is True, tokens and their embeddings can be added to the TokenEmbedding
        at a later stage.
    idx_to_vec : mxnet.ndarray.NDArray or None, default None
        If not None, a NDArray containing embeddings for the tokens specified in `idx_to_token`.
        The first dimension of `idx_to_vec` must be aligned with `idx_to_token`.
        If `idx_to_vec` is not None, `idx_to_token` must not be None either.
        If `idx_to_vec` is None, an empty TokenEmbedding object is created.
        If `allow_extend` is True, tokens and their embeddings can be added to the TokenEmbedding
        at a later stage.
        No copy of the idx_to_vec array is made as long as unknown_token is None or an embedding
        for unknown_token is specified in `idx_to_vec`.
""" def __init__(self, unknown_token=C.UNK_TOKEN, init_unknown_vec=INIT_UNKNOWN_VEC, allow_extend=False, unknown_lookup=None, idx_to_token=None, idx_to_vec=None): unknown_index = None # With pre-specified tokens and vectors if idx_to_vec is not None or idx_to_token is not None: idx_to_token = idx_to_token[:] # Sanity checks if idx_to_vec is None or idx_to_token is None: raise ValueError('Must specify either none or both of ' 'idx_to_token and idx_to_vec.') if idx_to_vec.shape[0] != len(idx_to_token): raise ValueError('idx_to_token and idx_to_vec must contain ' 'the same number of tokens and embeddings respectively.') if unknown_token is not None: try: unknown_index = idx_to_token.index(unknown_token) if init_unknown_vec is not None: logging.info('Ignoring init_unknown_vec as idx_to_vec is specified') except ValueError: if init_unknown_vec is not None: idx_to_token.insert(0, unknown_token) idx_to_vec = nd.concat(init_unknown_vec((1, idx_to_vec.shape[1])), idx_to_vec, dim=0) unknown_index = 0 else: raise ValueError('unknown_token "{}" is not part of idx_to_vec but ' 'init_unknown_vec is None. ' 'You must provide either of them.'.format(unknown_token)) # Initialization self._unknown_token = unknown_token self._init_unknown_vec = init_unknown_vec self._allow_extend = allow_extend self._unknown_lookup = unknown_lookup self._idx_to_token = idx_to_token self._idx_to_vec = idx_to_vec # Empty token-embedding else: # Initialization self._unknown_token = unknown_token if self._unknown_token is not None: unknown_index = UNK_IDX self._init_unknown_vec = init_unknown_vec self._allow_extend = allow_extend self._unknown_lookup = unknown_lookup assert UNK_IDX == 0 self._idx_to_token = [unknown_token] if unknown_token else [] self._idx_to_vec = None # Initialization of token_to_idx mapping if self._unknown_token: assert unknown_index is not None self._token_to_idx = DefaultLookupDict(unknown_index) else: self._token_to_idx = {} self._token_to_idx.update((token, idx) for idx, token in enumerate(self._idx_to_token)) @staticmethod def _get_file_url(cls_name, source_file_hash, source): namespace = 'gluon/embeddings/{}'.format(cls_name) return _get_repo_file_url(namespace, source_file_hash[source][0]) @classmethod def _get_file_path(cls, source_file_hash, embedding_root, source): cls_name = cls.__name__.lower() embedding_root = os.path.expanduser(embedding_root) url = cls._get_file_url(cls_name, source_file_hash, source) embedding_dir = os.path.join(embedding_root, cls_name) pretrained_file_name, expected_file_hash = source_file_hash[source] pretrained_file_path = os.path.join(embedding_dir, pretrained_file_name) if not os.path.exists(pretrained_file_path) \ or not check_sha1(pretrained_file_path, expected_file_hash): print('Embedding file {} is not found. Downloading from Gluon Repository. ' 'This may take some time.'.format(pretrained_file_name)) download(url, pretrained_file_path, sha1_hash=expected_file_hash) return pretrained_file_path @staticmethod def _load_embedding(pretrained_file_path, elem_delim, unknown_token, init_unknown_vec, encoding=ENCODING): """Load embedding vectors from a pre-trained token embedding file. Both text files and TokenEmbedding serialization files are supported. elem_delim and encoding are ignored for non-text files. 
For every unknown token, if its representation `self.unknown_token` is encountered in the pre-trained token embedding file, index 0 of `self.idx_to_vec` maps to the pre-trained token embedding vector loaded from the file; otherwise, index 0 of `self.idx_to_vec` maps to the text embedding vector initialized by `self._init_unknown_vec`. If a token is encountered multiple times in the pre-trained text embedding file, only the first-encountered token embedding vector will be loaded and the rest will be skipped. """ pretrained_file_path = os.path.expanduser(pretrained_file_path) if not os.path.isfile(pretrained_file_path): raise ValueError('`pretrained_file_path` must be a valid path ' 'to the pre-trained token embedding file.') logging.info('Loading pre-trained token embedding vectors from %s', pretrained_file_path) if pretrained_file_path.endswith('.npz'): return TokenEmbedding._load_embedding_serialized( pretrained_file_path=pretrained_file_path, unknown_token=unknown_token, init_unknown_vec=init_unknown_vec) else: return TokenEmbedding._load_embedding_txt( pretrained_file_path=pretrained_file_path, elem_delim=elem_delim, unknown_token=unknown_token, init_unknown_vec=init_unknown_vec, encoding=encoding) @staticmethod def _load_embedding_txt(pretrained_file_path, elem_delim, unknown_token, init_unknown_vec, encoding=ENCODING): """Load embedding vectors from a pre-trained token embedding file. Returns idx_to_token, idx_to_vec and unknown_token suitable for the TokenEmbedding constructor. For every unknown token, if its representation `unknown_token` is encountered in the pre-trained token embedding file, index 0 of `idx_to_vec` maps to the pre-trained token embedding vector loaded from the file; otherwise, index 0 of `idx_to_vec` maps to the text embedding vector initialized by `init_unknown_vec`. If a token is encountered multiple times in the pre-trained text embedding file, only the first-encountered token embedding vector will be loaded and the rest will be skipped. """ idx_to_token = [unknown_token] if unknown_token else [] unk_idx = None if unknown_token: unk_idx = 0 vec_len = None all_elems = [] tokens = set() loaded_unknown_vec = None with io.open(pretrained_file_path, 'rb') as f: for line_num, line in enumerate(f): try: line = line.decode(encoding) # pytype: disable=attribute-error except ValueError: warnings.warn('line {} in {}: failed to decode. Skipping.' .format(line_num, pretrained_file_path)) continue elems = line.rstrip().split(elem_delim) assert len(elems) > 1, 'line {} in {}: unexpected data format.'.format( line_num, pretrained_file_path) token, elems = elems[0], [float(i) for i in elems[1:]] if loaded_unknown_vec is None and token == unknown_token: loaded_unknown_vec = elems tokens.add(unknown_token) elif token in tokens: warnings.warn('line {} in {}: duplicate embedding found for ' 'token "{}". Skipped.'.format(line_num, pretrained_file_path, token)) elif len(elems) == 1 and line_num == 0: warnings.warn('line {} in {}: skipped likely header line.' .format(line_num, pretrained_file_path)) else: if not vec_len: vec_len = len(elems) if unknown_token: # Reserve a vector slot for the unknown token at the very beggining # because the unknown token index is 0. assert len(all_elems) == 0 all_elems.extend([0] * vec_len) else: assert len(elems) == vec_len, \ 'line {} in {}: found vector of inconsistent dimension for token ' \ '"{}". 
expected dim: {}, found: {}'.format(line_num, pretrained_file_path, token, vec_len, len(elems)) all_elems.extend(elems) idx_to_token.append(token) tokens.add(token) idx_to_vec = nd.array(all_elems).reshape((-1, vec_len)) if unknown_token: if loaded_unknown_vec is None: idx_to_vec[unk_idx] = init_unknown_vec(shape=vec_len) else: idx_to_vec[unk_idx] = nd.array(loaded_unknown_vec) return idx_to_token, idx_to_vec, unknown_token @staticmethod def _load_embedding_serialized(pretrained_file_path, unknown_token, init_unknown_vec): """Load embedding vectors from a pre-trained token embedding file. Returns idx_to_token, idx_to_vec and unknown_token suitable for the TokenEmbedding constructor. ValueError is raised if a token occurs multiple times. """ deserialized_embedding = TokenEmbedding.deserialize(pretrained_file_path) idx_to_token = deserialized_embedding.idx_to_token if len(set(idx_to_token)) != len(idx_to_token): raise ValueError('Serialized embedding contains duplicate tokens.') idx_to_vec = deserialized_embedding.idx_to_vec vec_len = idx_to_vec.shape[1] loaded_unknown_vec = False if deserialized_embedding.unknown_token: if not unknown_token: # If the TokenEmbedding shall not have an unknown token but the # serialized file provided one, delete the provided one. unk_idx = deserialized_embedding.token_to_idx[ deserialized_embedding.unknown_token] assert unk_idx >= 0 if unk_idx == 0: idx_to_token = idx_to_token[1:] idx_to_vec = idx_to_vec[1:] else: idx_to_token = idx_to_token[:unk_idx] + idx_to_token[unk_idx + 1:] idx_to_vec = nd.concat(idx_to_vec[:unk_idx], idx_to_vec[unk_idx + 1:], dim=0) else: # If the TokenEmbedding shall have an unknown token and the # serialized file provided one, replace the representation. unk_idx = deserialized_embedding.token_to_idx[ deserialized_embedding.unknown_token] idx_to_token[unk_idx] = unknown_token loaded_unknown_vec = True else: if unknown_token and unknown_token not in idx_to_token: # If the TokenEmbedding shall have an unknown token but the # serialized file didn't provided one, insert a new one idx_to_token = [unknown_token] + idx_to_token idx_to_vec = nd.concat(nd.zeros((1, vec_len)), idx_to_vec, dim=0) elif unknown_token: # The serialized file did define a unknown token, but contains # the token that is specified by the user to represent the # unknown token. assert not deserialized_embedding.unknown_token loaded_unknown_vec = True # Move unknown_token to idx 0 to replicate the behavior of # _load_embedding_text unk_idx = idx_to_token.index(unknown_token) if unk_idx > 0: idx_to_token[0], idx_to_token[unk_idx] = idx_to_token[unk_idx], idx_to_token[0] idx_to_vec[[0, unk_idx]] = idx_to_vec[[unk_idx, 0]] else: assert not deserialized_embedding.unknown_token assert not unknown_token if unknown_token and init_unknown_vec and not loaded_unknown_vec: unk_idx = idx_to_token.index(unknown_token) idx_to_vec[unk_idx] = init_unknown_vec(shape=vec_len) return idx_to_token, idx_to_vec, unknown_token @property def idx_to_token(self): """Index to token mapping. Returns ------- list of str: A list of indexed tokens where the list indices and the token indices are aligned. """ return self._idx_to_token @property def token_to_idx(self): """Token to index mapping. Returns ------- dict of int to strs: A dictionary of tokens with their corresponding index numbers; inverse vocab. """ return self._token_to_idx @property def idx_to_vec(self): """Index to vector mapping. 
Returns ------- mxnet.ndarray.NDArray: For all the indexed tokens in this embedding, this NDArray maps each token's index to an embedding vector. """ return self._idx_to_vec @property def unknown_token(self): """Unknown token representation. Any token that is unknown will be indexed using the representation of unknown_token. Returns ------- hashable object or None: Unknown token representation """ return self._unknown_token @property def allow_extend(self): """Allow extension of the TokenEmbedding with new tokens. If True, `TokenEmbedding[tokens] = vec` can introduce new tokens that were previously unknown. New indices will be assigned to the newly introduced tokens. If False, only known tokens can be updated. Returns ------- bool: Extension of the TokenEmbedding is allowed. """ return self._allow_extend @property def unknown_lookup(self): """Vector lookup for unknown tokens. If not None, unknown_lookup[tokens] is automatically called for any unknown tokens. Returns ------- Mapping[List[str], nd.NDarray] Vector lookup mapping from tokens to vectors. """ return self._unknown_lookup @unknown_lookup.setter def unknown_lookup(self, unknown_lookup): """Vector lookup for unknown tokens. If not None, unknown_lookup[tokens] is called for any unknown tokens. Parameters ---------- unknown_lookup : Mapping[List[str], nd.NDarray] Vector lookup mapping from tokens to vectors. """ self._unknown_lookup = unknown_lookup
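    # Illustrative sketch (not part of the original source): the properties
    # above expose the vocabulary and the embedding matrix directly, assuming
    # an embedding instance `emb` that keeps the default '<unk>' token.
    #
    # >>> emb.idx_to_token[0]              # '<unk>'
    # >>> emb.token_to_idx['<unk>']        # 0
    # >>> emb.idx_to_vec.shape             # (len(emb.idx_to_token), vec_len)
    # >>> emb.unknown_token                # '<unk>'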
    def __contains__(self, token):
        """Check if token is known.

        Parameters
        ----------
        token : str
            A token.

        Returns
        -------
        bool:
            Return True if the token is known. A token is known if it has been
            assigned an index and vector.
        """
        return token in self._token_to_idx
    def __eq__(self, other):
        if isinstance(other, TokenEmbedding):
            return self.unknown_token == other.unknown_token \
                and self.idx_to_token == other.idx_to_token and \
                ((self.idx_to_vec == other.idx_to_vec).min().asscalar() == 1) \
                and (self._token_to_idx == other._token_to_idx)
        else:
            return NotImplemented

    def __ne__(self, other):
        result = self.__eq__(other)
        if result is NotImplemented:
            return NotImplemented
        else:
            return not result
    def __getitem__(self, tokens):
        """Looks up embedding vectors of text tokens.

        Parameters
        ----------
        tokens : str or list of strs
            A token or a list of tokens.

        Returns
        -------
        mxnet.ndarray.NDArray:
            The embedding vector(s) of the token(s). According to numpy
            conventions, if `tokens` is a string, returns a 1-D NDArray
            (vector); if `tokens` is a list of strings, returns a 2-D NDArray
            (matrix) of shape=(len(tokens), vec_len).
        """
        to_reduce = not isinstance(tokens, (list, tuple))
        if to_reduce:
            tokens = [tokens]

        if self.unknown_lookup is not None:
            if self.idx_to_vec is None:
                # May raise KeyError, but we cannot fallback to idx_to_vec's
                # unknown vector, as idx_to_vec has not been initialized yet.
                # Cannot initialize it, as we don't know the dimension.
                vecs = self.unknown_lookup[tokens]
            else:
                vecs = [
                    self.idx_to_vec[self.token_to_idx[token]]
                    if (token in self.token_to_idx
                        or token not in self.unknown_lookup)
                    else self.unknown_lookup[token] for token in tokens]
                vecs = nd.stack(*vecs, axis=0)
        else:
            indices = [self._token_to_idx[token] for token in tokens]
            vecs = nd.Embedding(
                nd.array(indices), self.idx_to_vec, self.idx_to_vec.shape[0],
                self.idx_to_vec.shape[1])

        return vecs[0] if to_reduce else vecs
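    # Illustrative lookup sketch (not part of the original source), assuming an
    # embedding instance `emb` created via `gluonnlp.embedding.create` or
    # `TokenEmbedding.from_file`. A single token yields a 1-D vector, a list of
    # tokens a 2-D matrix, and unknown tokens fall back to index 0 (or to
    # `unknown_lookup` if one is attached).
    #
    # >>> vec = emb['hello']               # shape: (vec_len,)
    # >>> mat = emb[['hello', 'world']]    # shape: (2, vec_len)
    # >>> unk = emb['some-unseen-token']   # vector at index 0 unless unknown_lookup is set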
    def _check_vector_update(self, tokens, new_embedding):
        """Check that tokens and embedding are in the format for __setitem__."""
        assert self._idx_to_vec is not None, '`idx_to_vec` has not been initialized.'

        if not isinstance(tokens, (list, tuple)) or len(tokens) == 1:
            assert isinstance(new_embedding, nd.NDArray) and len(new_embedding.shape) in [1, 2], \
                '`new_embedding` must be a 1-D or 2-D NDArray if `tokens` is a single token.'
            if not isinstance(tokens, (list, tuple)):
                tokens = [tokens]
            if len(new_embedding.shape) == 1:
                new_embedding = new_embedding.expand_dims(0)
        else:
            assert isinstance(new_embedding, nd.NDArray) and len(new_embedding.shape) == 2, \
                '`new_embedding` must be a 2-D NDArray if `tokens` is a list of multiple strings.'
        if self._idx_to_vec is not None:
            assert new_embedding.shape == (len(tokens), self._idx_to_vec.shape[1]), \
                'The length of `new_embedding` must be equal to the number ' \
                'of tokens and the width of new_embedding must be equal ' \
                'to the dimension of embedding of the glossary.'
        else:
            assert new_embedding.shape[0] == len(tokens), \
                'The length of `new_embedding` must be equal to the number of tokens'
        return tokens
[docs] def __setitem__(self, tokens, new_embedding): """Updates embedding vectors for tokens. If self.allow_extend is True, vectors for previously unknown tokens can be introduced. Parameters ---------- tokens : hashable object or a list or tuple of hashable objects A token or a list of tokens whose embedding vector are to be updated. new_embedding : mxnet.ndarray.NDArray An NDArray to be assigned to the embedding vectors of `tokens`. Its length must be equal to the number of `tokens` and its width must be equal to the dimension of embedding of the glossary. If `tokens` is a singleton, it must be 1-D or 2-D. If `tokens` is a list of multiple strings, it must be 2-D. """ if not isinstance(tokens, (list, tuple)): tokens = [tokens] if ((self.allow_extend or all(t in self.token_to_idx for t in tokens)) and self._idx_to_vec is None): # Initialize self._idx_to_vec assert UNK_IDX == 0 self._idx_to_vec = self._init_unknown_vec( shape=(1, new_embedding.shape[-1])) tokens = self._check_vector_update(tokens, new_embedding) if self.allow_extend: # Add new / previously unknown tokens len_before = len(self._token_to_idx) for token in tokens: if token not in self._token_to_idx: idx = len(self._token_to_idx) self._token_to_idx[token] = idx self._idx_to_token.append(token) num_extended = len(self._token_to_idx) - len_before if num_extended >= 1: if num_extended == 1: warnings.warn( 'When adding new tokens via TokenEmbedding.__setitem__ ' 'the internal embedding matrix needs to be reallocated. ' 'Users are therefore encouraged to batch their updates ' '(i.e. add multiple new tokens at a time).') # Extend shape of idx_to_vec idx_to_vec = nd.zeros(shape=(len(self._token_to_idx), self.idx_to_vec.shape[1])) idx_to_vec[:self.idx_to_vec.shape[0]] = self._idx_to_vec self._idx_to_vec = idx_to_vec indices = [] for token in tokens: if token in self._token_to_idx: indices.append(self._token_to_idx[token]) else: if self.unknown_token: raise KeyError(('Token "{}" is unknown. To update the embedding vector for an' ' unknown token, please explicitly include "{}" as the ' '`unknown_token` in `tokens`. This is to avoid unintended ' 'updates.').format(token, self.unknown_token)) raise KeyError(('Token "{}" is unknown. Updating the embedding vector for an ' 'unknown token is not allowed because `unknown_token` is not ' 'specified.').format(token)) self._idx_to_vec[nd.array(indices)] = new_embedding
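    # Illustrative update/extension sketch for `__setitem__` above (not part of
    # the original source). With `allow_extend=True`, assigning to previously
    # unknown tokens grows the vocabulary; because the embedding matrix is
    # reallocated on extension, new tokens are best added in one batched
    # assignment. The token names below are made up.
    #
    # >>> import mxnet as mx
    # >>> import gluonnlp
    # >>> emb = gluonnlp.embedding.TokenEmbedding(allow_extend=True)
    # >>> emb[['foo', 'bar']] = mx.nd.random.uniform(shape=(2, 300))
    # >>> emb['foo'] = mx.nd.zeros(300)    # update an existing token's vector in place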
    @classmethod
    def _check_source(cls, source_file_hash, source):
        """Checks if a pre-trained token embedding source name is valid.

        Parameters
        ----------
        source : str
            The pre-trained token embedding source.
        """
        embedding_name = cls.__name__.lower()
        if source not in source_file_hash:
            raise KeyError('Cannot find pre-trained source {source} for token embedding {name}. '
                           'Valid pre-trained file names for embedding {name}: {values}'.format(
                               source=source, name=embedding_name,
                               values=', '.join(source_file_hash.keys())))
[docs] @staticmethod def from_file(file_path, elem_delim=' ', encoding=ENCODING, **kwargs): """Creates a user-defined token embedding from a pre-trained embedding file. This is to load embedding vectors from a user-defined pre-trained token embedding file. For example, if `elem_delim` = ' ', the expected format of a custom pre-trained token embedding file may look like: 'hello 0.1 0.2 0.3 0.4 0.5\\\\nworld 1.1 1.2 1.3 1.4 1.5\\\\n' where embedding vectors of words `hello` and `world` are [0.1, 0.2, 0.3, 0.4, 0.5] and [1.1, 1.2, 1.3, 1.4, 1.5] respectively. Parameters ---------- file_path : str The path to the user-defined pre-trained token embedding file. elem_delim : str, default ' ' The delimiter for splitting a token and every embedding vector element value on the same line of the custom pre-trained token embedding file. encoding : str, default 'utf8' The encoding scheme for reading the custom pre-trained token embedding file. kwargs : dict All other keyword arguments are passed to the TokenEmbedding initializer. Returns ------- instance of :class:`gluonnlp.embedding.TokenEmbedding` The user-defined token embedding instance. """ unknown_token = kwargs.pop('unknown_token', C.UNK_TOKEN) init_unknown_vec = kwargs.pop('init_unknown_vec', INIT_UNKNOWN_VEC) idx_to_token, idx_to_vec, unknown_token = TokenEmbedding._load_embedding( file_path, elem_delim=elem_delim, unknown_token=unknown_token, init_unknown_vec=init_unknown_vec, encoding=encoding) assert 'idx_to_vec' not in kwargs assert 'idx_to_token' not in kwargs return TokenEmbedding(unknown_token=unknown_token, init_unknown_vec=None, idx_to_token=idx_to_token, idx_to_vec=idx_to_vec, **kwargs)
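    # Illustrative sketch for `from_file` above (not part of the original
    # source): write a tiny whitespace-delimited embedding file and load it.
    # The file name and its contents are made up for the example.
    #
    # >>> import io
    # >>> import gluonnlp
    # >>> with io.open('tiny_embedding.txt', 'w', encoding='utf8') as f:
    # ...     _ = f.write(u'hello 0.1 0.2 0.3 0.4 0.5\nworld 1.1 1.2 1.3 1.4 1.5\n')
    # >>> emb = gluonnlp.embedding.TokenEmbedding.from_file('tiny_embedding.txt')
    # >>> emb['hello'].shape               # (5,)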
[docs] def serialize(self, file_path, compress=True): """Serializes the TokenEmbedding to a file specified by file_path. TokenEmbedding is serialized by converting the list of tokens, the array of word embeddings and other metadata to numpy arrays, saving all in a single (optionally compressed) Zipfile. See https://docs.scipy.org/doc/numpy-1.14.2/neps/npy-format.html for more information on the format. Parameters ---------- file_path : str or file The path at which to create the file holding the serialized TokenEmbedding. If file is a string or a Path, the .npz extension will be appended to the file name if it is not already there. compress : bool, default True Compress the Zipfile or leave it uncompressed. """ if self.unknown_lookup is not None: warnings.warn( 'Serialization of `unknown_lookup` is not supported. ' 'Save it manually and pass the loaded lookup object ' 'during deserialization.') unknown_token = np.array(self.unknown_token) idx_to_token = np.array(self.idx_to_token, dtype='O') idx_to_vec = self.idx_to_vec.asnumpy() if not unknown_token: # Store empty string instead of None unknown_token = '' if not compress: np.savez(file=file_path, unknown_token=unknown_token, idx_to_token=idx_to_token, idx_to_vec=idx_to_vec) else: np.savez_compressed(file=file_path, unknown_token=unknown_token, idx_to_token=idx_to_token, idx_to_vec=idx_to_vec)
[docs] @staticmethod def deserialize(file_path, **kwargs): """Create a new TokenEmbedding from a serialized one. TokenEmbedding is serialized by converting the list of tokens, the array of word embeddings and other metadata to numpy arrays, saving all in a single (optionally compressed) Zipfile. See https://docs.scipy.org/doc/numpy-1.14.2/neps/npy-format.html for more information on the format. Parameters ---------- file_path : str or file The path to a file that holds the serialized TokenEmbedding. kwargs : dict Keyword arguments are passed to the TokenEmbedding initializer. Useful for attaching unknown_lookup. """ # idx_to_token is of dtype 'O' so we need to allow pickle npz_dict = np.load(file_path, allow_pickle=True) unknown_token = npz_dict['unknown_token'] if not unknown_token: unknown_token = None else: if isinstance(unknown_token, np.ndarray): if unknown_token.dtype.kind == 'S': unknown_token = unknown_token.tobytes().decode() else: unknown_token = str(unknown_token) idx_to_token = npz_dict['idx_to_token'].tolist() idx_to_vec = nd.array(npz_dict['idx_to_vec']) assert 'unknown_token' not in kwargs assert 'init_unknown_vec' not in kwargs assert 'idx_to_vec' not in kwargs assert 'idx_to_token' not in kwargs return TokenEmbedding(unknown_token=unknown_token, init_unknown_vec=None, idx_to_token=idx_to_token, idx_to_vec=idx_to_vec, **kwargs)
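    # Illustrative round-trip sketch for `serialize`/`deserialize` above (not
    # part of the original source), assuming an embedding instance `emb` as in
    # the sketches above. The file name is made up; the `.npz` extension is
    # appended automatically if it is missing.
    #
    # >>> emb.serialize('my_embedding.npz')
    # >>> restored = gluonnlp.embedding.TokenEmbedding.deserialize('my_embedding.npz')
    # >>> restored == emb                  # True: __eq__ compares tokens and vectors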
[docs]@register class GloVe(TokenEmbedding): """The GloVe word embedding. GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space. (Source from https://nlp.stanford.edu/projects/glove/) Reference: GloVe: Global Vectors for Word Representation. Jeffrey Pennington, Richard Socher, and Christopher D. Manning. https://nlp.stanford.edu/pubs/glove.pdf Website: https://nlp.stanford.edu/projects/glove/ To get the updated URLs to the externally hosted pre-trained token embedding files, visit https://nlp.stanford.edu/projects/glove/ License for pre-trained embedding: https://opendatacommons.org/licenses/pddl/ Available sources >>> import gluonnlp as nlp >>> sorted(nlp.embedding.list_sources('GloVe')) [\ 'glove.42B.300d', 'glove.6B.100d', 'glove.6B.200d', 'glove.6B.300d', 'glove.6B.50d', \ 'glove.840B.300d', 'glove.twitter.27B.100d', 'glove.twitter.27B.200d', \ 'glove.twitter.27B.25d', 'glove.twitter.27B.50d'\ ] Parameters ---------- source : str, default 'glove.6B.50d' The name of the pre-trained token embedding file. embedding_root : str, default '$MXNET_HOME/embedding' The root directory for storing embedding-related files. MXNET_HOME defaults to '~/.mxnet'. kwargs All other keyword arguments are passed to `gluonnlp.embedding.TokenEmbedding`. Attributes ---------- idx_to_vec : mxnet.ndarray.NDArray For all the indexed tokens in this embedding, this NDArray maps each token's index to an embedding vector. unknown_token : hashable object The representation for any unknown token. In other words, any unknown token will be indexed as the same representation. """ # Map a pre-trained token embedding file and its SHA-1 hash. source_file_hash = C.GLOVE_NPZ_SHA1 def __init__(self, source='glove.6B.50d', embedding_root=os.path.join(get_home_dir(), 'embedding'), **kwargs): self._check_source(self.source_file_hash, source) pretrained_file_path = GloVe._get_file_path(self.source_file_hash, embedding_root, source) unknown_token = kwargs.pop('unknown_token', C.UNK_TOKEN) init_unknown_vec = kwargs.pop('init_unknown_vec', INIT_UNKNOWN_VEC) encoding = kwargs.pop('encoding', ENCODING) idx_to_token, idx_to_vec, unknown_token = self._load_embedding( pretrained_file_path=pretrained_file_path, elem_delim=' ', unknown_token=unknown_token, init_unknown_vec=init_unknown_vec, encoding=encoding) assert 'idx_to_vec' not in kwargs assert 'idx_to_token' not in kwargs super(GloVe, self).__init__(unknown_token=unknown_token, init_unknown_vec=None, idx_to_token=idx_to_token, idx_to_vec=idx_to_vec, **kwargs)
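# Illustrative usage sketch for GloVe (not part of the original source).
# `gluonnlp.data.count_tokens`, `gluonnlp.Vocab` and its `set_embedding`
# method come from the wider gluonnlp package, not from this module; the toy
# sentence below is made up.
#
# >>> import gluonnlp as nlp
# >>> glove = nlp.embedding.GloVe(source='glove.6B.50d')
# >>> counter = nlp.data.count_tokens('hello world hello'.split())
# >>> vocab = nlp.Vocab(counter)
# >>> vocab.set_embedding(glove)
# >>> vocab.embedding['hello'].shape      # (50,)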
[docs]@register class FastText(TokenEmbedding): """The fastText word embedding. FastText is an open-source, free, lightweight library that allows users to learn text representations and text classifiers. It works on standard, generic hardware. Models can later be reduced in size to even fit on mobile devices. (Source from https://fasttext.cc/) References: Enriching Word Vectors with Subword Information. Piotr Bojanowski, Edouard Grave, Armand Joulin, and Tomas Mikolov. https://arxiv.org/abs/1607.04606 Bag of Tricks for Efficient Text Classification. Armand Joulin, Edouard Grave, Piotr Bojanowski, and Tomas Mikolov. https://arxiv.org/abs/1607.01759 FastText.zip: Compressing text classification models. Armand Joulin, Edouard Grave, Piotr Bojanowski, Matthijs Douze, Herve Jegou, and Tomas Mikolov. https://arxiv.org/abs/1612.03651 For 'wiki.multi' embedding: Word Translation Without Parallel Data Alexis Conneau, Guillaume Lample, Marc'Aurelio Ranzato, Ludovic Denoyer, and Herve Jegou. https://arxiv.org/abs/1710.04087 Website: https://fasttext.cc/ To get the updated URLs to the externally hosted pre-trained token embedding files, visit https://github.com/facebookresearch/fastText/blob/master/docs/pretrained-vectors.md License for pre-trained embedding: https://creativecommons.org/licenses/by-sa/3.0/ Available sources >>> import gluonnlp as nlp >>> sorted(nlp.embedding.list_sources('FastText')) [\ 'cc.af.300', 'cc.als.300', 'cc.am.300', 'cc.an.300', 'cc.ar.300', 'cc.arz.300', \ 'cc.as.300', 'cc.ast.300', 'cc.az.300', 'cc.azb.300', 'cc.ba.300', 'cc.bar.300', \ 'cc.bcl.300', 'cc.be.300', 'cc.bg.300', 'cc.bh.300', 'cc.bn.300', 'cc.bo.300', \ 'cc.bpy.300', 'cc.br.300', 'cc.bs.300', 'cc.ca.300', 'cc.ce.300', 'cc.ceb.300', \ 'cc.ckb.300', 'cc.co.300', 'cc.cs.300', 'cc.cv.300', 'cc.cy.300', 'cc.da.300', \ 'cc.de.300', 'cc.diq.300', 'cc.dv.300', 'cc.el.300', 'cc.eml.300', 'cc.en.300', \ 'cc.eo.300', 'cc.es.300', 'cc.et.300', 'cc.eu.300', 'cc.fa.300', 'cc.fi.300', \ 'cc.fr.300', 'cc.frr.300', 'cc.fy.300', 'cc.ga.300', 'cc.gd.300', 'cc.gl.300', \ 'cc.gom.300', 'cc.gu.300', 'cc.gv.300', 'cc.he.300', 'cc.hi.300', 'cc.hif.300', \ 'cc.hr.300', 'cc.hsb.300', 'cc.ht.300', 'cc.hu.300', 'cc.hy.300', 'cc.ia.300', \ 'cc.id.300', 'cc.ilo.300', 'cc.io.300', 'cc.is.300', 'cc.it.300', 'cc.ja.300', \ 'cc.jv.300', 'cc.ka.300', 'cc.kk.300', 'cc.km.300', 'cc.kn.300', 'cc.ko.300', \ 'cc.ku.300', 'cc.ky.300', 'cc.la.300', 'cc.lb.300', 'cc.li.300', 'cc.lmo.300', \ 'cc.lt.300', 'cc.lv.300', 'cc.mai.300', 'cc.mg.300', 'cc.mhr.300', 'cc.min.300', \ 'cc.mk.300', 'cc.ml.300', 'cc.mn.300', 'cc.mr.300', 'cc.mrj.300', 'cc.ms.300', \ 'cc.mt.300', 'cc.mwl.300', 'cc.my.300', 'cc.myv.300', 'cc.mzn.300', 'cc.nah.300', \ 'cc.nap.300', 'cc.nds.300', 'cc.ne.300', 'cc.new.300', 'cc.nl.300', 'cc.nn.300', \ 'cc.no.300', 'cc.nso.300', 'cc.oc.300', 'cc.or.300', 'cc.os.300', 'cc.pa.300', \ 'cc.pam.300', 'cc.pfl.300', 'cc.pl.300', 'cc.pms.300', 'cc.pnb.300', 'cc.ps.300', \ 'cc.pt.300', 'cc.qu.300', 'cc.rm.300', 'cc.ro.300', 'cc.ru.300', 'cc.sa.300', \ 'cc.sah.300', 'cc.sc.300', 'cc.scn.300', 'cc.sco.300', 'cc.sd.300', 'cc.sh.300', \ 'cc.si.300', 'cc.sk.300', 'cc.sl.300', 'cc.so.300', 'cc.sq.300', 'cc.sr.300', \ 'cc.su.300', 'cc.sv.300', 'cc.sw.300', 'cc.ta.300', 'cc.te.300', 'cc.tg.300', \ 'cc.th.300', 'cc.tk.300', 'cc.tl.300', 'cc.tr.300', 'cc.tt.300', 'cc.ug.300', \ 'cc.uk.300', 'cc.ur.300', 'cc.uz.300', 'cc.vec.300', 'cc.vi.300', 'cc.vls.300', \ 'cc.vo.300', 'cc.wa.300', 'cc.war.300', 'cc.xmf.300', 'cc.yi.300', 'cc.yo.300', \ 'cc.zea.300', 
'cc.zh.300', 'crawl-300d-2M', 'crawl-300d-2M-subword', \ 'wiki-news-300d-1M', 'wiki-news-300d-1M-subword', 'wiki.aa', 'wiki.ab', 'wiki.ace', \ 'wiki.ady', 'wiki.af', 'wiki.ak', 'wiki.als', 'wiki.am', 'wiki.an', 'wiki.ang', \ 'wiki.ar', 'wiki.arc', 'wiki.arz', 'wiki.as', 'wiki.ast', 'wiki.av', 'wiki.ay', \ 'wiki.az', 'wiki.azb', 'wiki.ba', 'wiki.bar', 'wiki.bat_smg', 'wiki.bcl', 'wiki.be', \ 'wiki.bg', 'wiki.bh', 'wiki.bi', 'wiki.bjn', 'wiki.bm', 'wiki.bn', 'wiki.bo', \ 'wiki.bpy', 'wiki.br', 'wiki.bs', 'wiki.bug', 'wiki.bxr', 'wiki.ca', 'wiki.cbk_zam', \ 'wiki.cdo', 'wiki.ce', 'wiki.ceb', 'wiki.ch', 'wiki.cho', 'wiki.chr', 'wiki.chy', \ 'wiki.ckb', 'wiki.co', 'wiki.cr', 'wiki.crh', 'wiki.cs', 'wiki.csb', 'wiki.cu', \ 'wiki.cv', 'wiki.cy', 'wiki.da', 'wiki.de', 'wiki.diq', 'wiki.dsb', 'wiki.dv', \ 'wiki.dz', 'wiki.ee', 'wiki.el', 'wiki.eml', 'wiki.en', 'wiki.eo', 'wiki.es', \ 'wiki.et', 'wiki.eu', 'wiki.ext', 'wiki.fa', 'wiki.ff', 'wiki.fi', 'wiki.fiu_vro', \ 'wiki.fj', 'wiki.fo', 'wiki.fr', 'wiki.frp', 'wiki.frr', 'wiki.fur', 'wiki.fy', \ 'wiki.ga', 'wiki.gag', 'wiki.gan', 'wiki.gd', 'wiki.gl', 'wiki.glk', 'wiki.gn', \ 'wiki.gom', 'wiki.got', 'wiki.gu', 'wiki.gv', 'wiki.ha', 'wiki.hak', 'wiki.haw', \ 'wiki.he', 'wiki.hi', 'wiki.hif', 'wiki.ho', 'wiki.hr', 'wiki.hsb', 'wiki.ht', \ 'wiki.hu', 'wiki.hy', 'wiki.hz', 'wiki.ia', 'wiki.id', 'wiki.ie', 'wiki.ig', \ 'wiki.ii', 'wiki.ik', 'wiki.ilo', 'wiki.io', 'wiki.is', 'wiki.it', 'wiki.iu', \ 'wiki.ja', 'wiki.jam', 'wiki.jbo', 'wiki.jv', 'wiki.ka', 'wiki.kaa', 'wiki.kab', \ 'wiki.kbd', 'wiki.kg', 'wiki.ki', 'wiki.kj', 'wiki.kk', 'wiki.kl', 'wiki.km', \ 'wiki.kn', 'wiki.ko', 'wiki.koi', 'wiki.kr', 'wiki.krc', 'wiki.ks', 'wiki.ksh', \ 'wiki.ku', 'wiki.kv', 'wiki.kw', 'wiki.ky', 'wiki.la', 'wiki.lad', 'wiki.lb', \ 'wiki.lbe', 'wiki.lez', 'wiki.lg', 'wiki.li', 'wiki.lij', 'wiki.lmo', 'wiki.ln', \ 'wiki.lo', 'wiki.lrc', 'wiki.lt', 'wiki.ltg', 'wiki.lv', 'wiki.mai', 'wiki.map_bms', \ 'wiki.mdf', 'wiki.mg', 'wiki.mh', 'wiki.mhr', 'wiki.mi', 'wiki.min', 'wiki.mk', \ 'wiki.ml', 'wiki.mn', 'wiki.mo', 'wiki.mr', 'wiki.mrj', 'wiki.ms', 'wiki.mt', \ 'wiki.multi.ar', 'wiki.multi.bg', 'wiki.multi.ca', 'wiki.multi.cs', 'wiki.multi.da', \ 'wiki.multi.de', 'wiki.multi.el', 'wiki.multi.en', 'wiki.multi.es', 'wiki.multi.et', \ 'wiki.multi.fi', 'wiki.multi.fr', 'wiki.multi.he', 'wiki.multi.hr', 'wiki.multi.hu', \ 'wiki.multi.id', 'wiki.multi.it', 'wiki.multi.mk', 'wiki.multi.nl', 'wiki.multi.no', \ 'wiki.multi.pl', 'wiki.multi.pt', 'wiki.multi.ro', 'wiki.multi.ru', 'wiki.multi.sk', \ 'wiki.multi.sl', 'wiki.multi.sv', 'wiki.multi.tr', 'wiki.multi.uk', 'wiki.multi.vi', \ 'wiki.mus', 'wiki.mwl', 'wiki.my', 'wiki.myv', 'wiki.mzn', 'wiki.na', 'wiki.nah', \ 'wiki.nap', 'wiki.nds', 'wiki.nds_nl', 'wiki.ne', 'wiki.new', 'wiki.ng', 'wiki.nl', \ 'wiki.nn', 'wiki.no', 'wiki.nov', 'wiki.nrm', 'wiki.nso', 'wiki.nv', 'wiki.ny', \ 'wiki.oc', 'wiki.olo', 'wiki.om', 'wiki.or', 'wiki.os', 'wiki.pa', 'wiki.pag', \ 'wiki.pam', 'wiki.pap', 'wiki.pcd', 'wiki.pdc', 'wiki.pfl', 'wiki.pi', 'wiki.pih', \ 'wiki.pl', 'wiki.pms', 'wiki.pnb', 'wiki.pnt', 'wiki.ps', 'wiki.pt', 'wiki.qu', \ 'wiki.rm', 'wiki.rmy', 'wiki.rn', 'wiki.ro', 'wiki.roa_rup', 'wiki.roa_tara', \ 'wiki.ru', 'wiki.rue', 'wiki.rw', 'wiki.sa', 'wiki.sah', 'wiki.sc', 'wiki.scn', \ 'wiki.sco', 'wiki.sd', 'wiki.se', 'wiki.sg', 'wiki.sh', 'wiki.si', 'wiki.simple', \ 'wiki.sk', 'wiki.sl', 'wiki.sm', 'wiki.sn', 'wiki.so', 'wiki.sq', 'wiki.sr', \ 'wiki.srn', 'wiki.ss', 'wiki.st', 'wiki.stq', 'wiki.su', 'wiki.sv', 'wiki.sw', \ 
'wiki.szl', 'wiki.ta', 'wiki.tcy', 'wiki.te', 'wiki.tet', 'wiki.tg', 'wiki.th', \ 'wiki.ti', 'wiki.tk', 'wiki.tl', 'wiki.tn', 'wiki.to', 'wiki.tpi', 'wiki.tr', \ 'wiki.ts', 'wiki.tt', 'wiki.tum', 'wiki.tw', 'wiki.ty', 'wiki.tyv', 'wiki.udm', \ 'wiki.ug', 'wiki.uk', 'wiki.ur', 'wiki.uz', 'wiki.ve', 'wiki.vec', 'wiki.vep', \ 'wiki.vi', 'wiki.vls', 'wiki.vo', 'wiki.wa', 'wiki.war', 'wiki.wo', 'wiki.wuu', \ 'wiki.xal', 'wiki.xh', 'wiki.xmf', 'wiki.yi', 'wiki.yo', 'wiki.za', 'wiki.zea', \ 'wiki.zh', 'wiki.zh_classical', 'wiki.zh_min_nan', 'wiki.zh_yue', 'wiki.zu'\ ] Parameters ---------- source : str, default 'wiki.simple' The name of the pre-trained token embedding file. embedding_root : str, default '$MXNET_HOME/embedding' The root directory for storing embedding-related files. MXNET_HOME defaults to '~/.mxnet'. load_ngrams : bool, default False Load vectors for ngrams so that computing vectors for OOV words is possible. This is disabled by default as it requires downloading an additional 2GB file containing the vectors for ngrams. Note that facebookresearch did not publish ngram vectors for all their models. If load_ngrams is True, but no ngram vectors are available for the chosen source this a RuntimeError is thrown. The ngram vectors are passed to the resulting TokenEmbedding as `unknown_lookup`. ctx : mx.Context, default mxnet.cpu() Context to load the FasttextEmbeddingModel for ngram vectors to. This parameter is ignored if load_ngrams is False. kwargs All other keyword arguments are passed to `gluonnlp.embedding.TokenEmbedding`. Attributes ---------- idx_to_vec : mxnet.ndarray.NDArray For all the indexed tokens in this embedding, this NDArray maps each token's index to an embedding vector. unknown_token : hashable object The representation for any unknown token. In other words, any unknown token will be indexed as the same representation. """ # Map a pre-trained token embedding file and its SHA-1 hash. source_file_hash = C.FAST_TEXT_NPZ_SHA1 source_bin_file_hash = C.FAST_TEXT_BIN_SHA1 def __init__(self, source='wiki.simple', embedding_root=os.path.join( get_home_dir(), 'embedding'), load_ngrams=False, ctx=cpu(), **kwargs): self._check_source(self.source_file_hash, source) pretrained_file_path = FastText._get_file_path(self.source_file_hash, embedding_root, source) if load_ngrams: try: self._check_source(self.source_bin_file_hash, source) except KeyError: raise KeyError( 'No ngrams are available for {}. ' 'Ngram features were published for the following embeddings: {}'. format(source, ', '.join(self.source_bin_file_hash.keys()))) pretrained_bin_file_path = FastText._get_file_path(self.source_bin_file_hash, embedding_root, source) unknown_lookup = FasttextEmbeddingModel.load_fasttext_format( pretrained_bin_file_path, ctx=ctx) else: unknown_lookup = None unknown_token = kwargs.pop('unknown_token', C.UNK_TOKEN) init_unknown_vec = kwargs.pop('init_unknown_vec', INIT_UNKNOWN_VEC) encoding = kwargs.pop('encoding', ENCODING) idx_to_token, idx_to_vec, unknown_token = self._load_embedding( pretrained_file_path=pretrained_file_path, elem_delim=' ', unknown_token=unknown_token, init_unknown_vec=init_unknown_vec, encoding=encoding) assert 'idx_to_vec' not in kwargs assert 'idx_to_token' not in kwargs super(FastText, self).__init__(unknown_token=unknown_token, init_unknown_vec=None, idx_to_token=idx_to_token, idx_to_vec=idx_to_vec, unknown_lookup=unknown_lookup, **kwargs)
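# Illustrative usage sketch for FastText (not part of the original source).
# With `load_ngrams=True` the subword model is attached as `unknown_lookup`,
# so vectors for out-of-vocabulary words are composed from character ngrams
# (the OOV word below is made up). The same source/flag combination appears in
# the `create` docstring above.
#
# >>> import gluonnlp as nlp
# >>> ft = nlp.embedding.FastText(source='wiki.simple', load_ngrams=True)
# >>> ft['hello'].shape                   # known word, read from idx_to_vec
# >>> ft['helloooooo'].shape              # OOV word, computed via unknown_lookup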
[docs]@register class Word2Vec(TokenEmbedding): """The Word2Vec word embedding. Word2Vec is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed with continuous bag-of-words or skip-gram architecture for computing vector representations of words. References: [1] Tomas Mikolov, Kai Chen, Greg Corrado, and Jeffrey Dean. Efficient Estimation of Word Representations in Vector Space. In Proceedings of Workshop at ICLR, 2013. [2] Tomas Mikolov, Ilya Sutskever, Kai Chen, Greg Corrado, and Jeffrey Dean. Distributed Representations of Words and Phrases and their Compositionality. In Proceedings of NIPS, 2013. [3] Tomas Mikolov, Wen-tau Yih, and Geoffrey Zweig. Linguistic Regularities in Continuous Space Word Representations. In Proceedings of NAACL HLT, 2013. Website: https://code.google.com/archive/p/word2vec/ License for pre-trained embedding: Unspecified Available sources >>> import gluonnlp as nlp >>> sorted(nlp.embedding.list_sources('Word2Vec')) [\ 'GoogleNews-vectors-negative300', 'freebase-vectors-skipgram1000', \ 'freebase-vectors-skipgram1000-en'\ ] Parameters ---------- source : str, default 'GoogleNews-vectors-negative300' The name of the pre-trained token embedding file. A binary pre-trained file outside from the source list can be used for this constructor by passing the path to it which ends with .bin as file extension name. embedding_root : str, default '$MXNET_HOME/embedding' The root directory for storing embedding-related files. MXNET_HOME defaults to '~/.mxnet'. kwargs All other keyword arguments are passed to `gluonnlp.embedding.TokenEmbedding`. Attributes ---------- idx_to_vec : mxnet.ndarray.NDArray For all the indexed tokens in this embedding, this NDArray maps each token's index to an embedding vector. unknown_token : hashable object The representation for any unknown token. In other words, any unknown token will be indexed as the same representation. """ # Map a pre-trained token embedding file and its SHA-1 hash. source_file_hash = C.WORD2VEC_NPZ_SHA1 def __init__(self, source='GoogleNews-vectors-negative300', embedding_root=os.path.join(get_home_dir(), 'embedding'), encoding=ENCODING, **kwargs): unknown_token = kwargs.pop('unknown_token', C.UNK_TOKEN) init_unknown_vec = kwargs.pop('init_unknown_vec', INIT_UNKNOWN_VEC) if source.endswith('.bin'): pretrained_file_path = os.path.expanduser(source) idx_to_token, idx_to_vec, unknown_token = self._load_w2v_binary( pretrained_file_path, unknown_token=unknown_token, init_unknown_vec=init_unknown_vec, encoding=encoding) else: self._check_source(self.source_file_hash, source) pretrained_file_path = self._get_file_path(self.source_file_hash, embedding_root, source) idx_to_token, idx_to_vec, unknown_token = self._load_embedding( pretrained_file_path=pretrained_file_path, elem_delim=' ', unknown_token=unknown_token, init_unknown_vec=init_unknown_vec, encoding=encoding) assert 'idx_to_vec' not in kwargs assert 'idx_to_token' not in kwargs super(Word2Vec, self).__init__(unknown_token=unknown_token, init_unknown_vec=None, idx_to_token=idx_to_token, idx_to_vec=idx_to_vec, **kwargs) @classmethod def _load_w2v_binary(cls, pretrained_file_path, unknown_token, init_unknown_vec=INIT_UNKNOWN_VEC, encoding=ENCODING): """Load embedding vectors from a binary pre-trained token embedding file. Parameters ---------- pretrained_file_path: str The path to a binary pre-trained token embedding file end with .bin as file extension name. encoding: str The encoding type of the file. 
""" idx_to_token = [unknown_token] if unknown_token else [] unk_idx = None if unknown_token: unk_idx = 0 all_elems = [] tokens = set() loaded_unknown_vec = None pretrained_file_path = os.path.expanduser(pretrained_file_path) with io.open(pretrained_file_path, 'rb') as f: header = f.readline().decode(encoding=encoding) # pytype: disable=attribute-error vocab_size, vec_len = (int(x) for x in header.split()) if unknown_token: # Reserve a vector slot for the unknown token at the very beggining # because the unknown token index is 0. all_elems.extend([0] * vec_len) binary_len = np.dtype(np.float32).itemsize * vec_len for line_num in range(vocab_size): token = [] while True: ch = f.read(1) if ch == b' ': break if ch == b'': raise EOFError('unexpected end of input; is count incorrect or file ' 'otherwise damaged?') if ch != b'\n': # ignore newlines in front of words (some binary files have) token.append(ch) try: token = b''.join(token).decode(encoding=encoding) except ValueError: warnings.warn('line {} in {}: failed to decode. Skipping.' .format(line_num, pretrained_file_path)) continue elems = np.frombuffer(f.read(binary_len), dtype=np.float32) assert len(elems) > 1, 'line {} in {}: unexpected data format.'.format( line_num, pretrained_file_path) if token == unknown_token and loaded_unknown_vec is None: loaded_unknown_vec = elems tokens.add(unknown_token) elif token in tokens: warnings.warn('line {} in {}: duplicate embedding found for ' 'token "{}". Skipped.'.format(line_num, pretrained_file_path, token)) else: assert len(elems) == vec_len, \ 'line {} in {}: found vector of inconsistent dimension for token ' \ '"{}". expected dim: {}, found: {}'.format(line_num, pretrained_file_path, token, vec_len, len(elems)) all_elems.extend(elems) idx_to_token.append(token) tokens.add(token) idx_to_vec = nd.array(all_elems).reshape((-1, vec_len)) if unknown_token: if loaded_unknown_vec is None: idx_to_vec[unk_idx] = init_unknown_vec(shape=vec_len) else: idx_to_vec[unk_idx] = nd.array(loaded_unknown_vec) return idx_to_token, idx_to_vec, unknown_token
    @classmethod
    def from_w2v_binary(cls, pretrained_file_path, encoding=ENCODING):
        """Load embedding vectors from a binary pre-trained token embedding file.

        Parameters
        ----------
        pretrained_file_path: str
            The path to a binary pre-trained token embedding file ending with
            .bin as its file extension.
        encoding: str
            The encoding type of the file.
        """
        return cls(source=pretrained_file_path, encoding=encoding)
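# Illustrative usage sketch for Word2Vec (not part of the original source).
# Either a hosted source or a local word2vec binary file can be loaded; the
# local path below is hypothetical.
#
# >>> import gluonnlp as nlp
# >>> w2v = nlp.embedding.Word2Vec(source='GoogleNews-vectors-negative300')
# >>> w2v['hello'].shape                  # (300,)
# >>> w2v_local = nlp.embedding.Word2Vec.from_w2v_binary('/path/to/vectors.bin')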