Source code for gluonnlp.embedding.token_embedding

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

# pylint: disable=consider-iterating-dictionary, too-many-lines
"""Text token embedding."""

__all__ = [
    'register', 'create', 'list_sources', 'TokenEmbedding', 'GloVe',
    'FastText', 'Word2Vec'
]

import io
import logging
import os
import warnings

import numpy as np
from mxnet import nd, registry, cpu
from mxnet.gluon.utils import download, check_sha1, _get_repo_file_url

from .. import _constants as C
from ..base import get_home_dir
from ..data.utils import DefaultLookupDict
from ..model.train import FasttextEmbeddingModel

UNK_IDX = 0
ENCODING = 'utf8'
INIT_UNKNOWN_VEC = nd.zeros

def register(embedding_cls):
    """Registers a new token embedding.

    Once an embedding is registered, we can create an instance of this
    embedding with :func:`~gluonnlp.embedding.create`.


    Examples
    --------
    >>> @gluonnlp.embedding.register
    ... class MyTextEmbed(gluonnlp.embedding.TokenEmbedding):
    ...     def __init__(self, source='my_pretrain_file'):
    ...         pass
    >>> embed = gluonnlp.embedding.create('MyTextEmbed')
    >>> print(type(embed))
    <class 'gluonnlp.embedding.token_embedding.MyTextEmbed'>
    """
    register_text_embedding = registry.get_register_func(TokenEmbedding, 'token embedding')

    return register_text_embedding(embedding_cls)
def create(embedding_name, **kwargs):
    """Creates an instance of token embedding.

    Creates a token embedding instance by loading embedding vectors from an
    externally hosted pre-trained token embedding file, such as those of GloVe
    and FastText. To get all the valid `embedding_name` and `source`, use
    :func:`gluonnlp.embedding.list_sources`.


    Parameters
    ----------
    embedding_name : str
        The token embedding name (case-insensitive).
    kwargs : dict
        All other keyword arguments are passed to the initializer of token
        embedding class. For example
        `create(embedding_name='fasttext', source='wiki.simple', load_ngrams=True)`
        will return `FastText(source='wiki.simple', load_ngrams=True)`.


    Returns
    -------
    An instance of :class:`gluonnlp.embedding.TokenEmbedding`:
        A token embedding instance that loads embedding vectors from an
        externally hosted pre-trained token embedding file.
    """
    create_text_embedding = registry.get_create_func(TokenEmbedding, 'token embedding')

    return create_text_embedding(embedding_name, **kwargs)
def list_sources(embedding_name=None):
    """Get valid token embedding names and their pre-trained file names.

    To load token embedding vectors from an externally hosted pre-trained
    token embedding file, such as those of GloVe and FastText, one should use
    `gluonnlp.embedding.create(embedding_name, source)`. This method returns
    all the valid names of `source` for the specified `embedding_name`. If
    `embedding_name` is set to None, this method returns all the valid names
    of `embedding_name` with their associated `source`.


    Parameters
    ----------
    embedding_name : str or None, default None
        The pre-trained token embedding name.


    Returns
    -------
    dict or list:
        A list of all the valid pre-trained token embedding file names
        (`source`) for the specified token embedding name (`embedding_name`).
        If the text embedding name is set to None, returns a dict mapping each
        valid token embedding name to a list of valid pre-trained files
        (`source`). They can be plugged into
        `gluonnlp.embedding.create(embedding_name, source)`.
    """
    text_embedding_reg = registry.get_registry(TokenEmbedding)

    if embedding_name is not None:
        embedding_name = embedding_name.lower()
        if embedding_name not in text_embedding_reg:
            raise KeyError('Cannot find `embedding_name` {}. Use '
                           '`list_sources(embedding_name=None).keys()` to get all the valid '
                           'embedding names.'.format(embedding_name))
        return list(text_embedding_reg[embedding_name].source_file_hash.keys())
    else:
        return {embedding_name: list(embedding_cls.source_file_hash.keys())
                for embedding_name, embedding_cls in registry.get_registry(TokenEmbedding).items()}
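# Illustrative usage sketch (not part of the original source): `list_sources`
# and `create` above are the public entry points of this module. The source
# name 'glove.6B.50d' is taken from the GloVe source list further below.
#
# >>> import gluonnlp as nlp
# >>> sorted(nlp.embedding.list_sources().keys())      # the registered embedding names
# >>> nlp.embedding.list_sources('glove')              # valid `source` names for GloVe
# >>> glove = nlp.embedding.create('glove', source='glove.6B.50d')
# >>> glove['hello'].shape                             # (50,)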
class TokenEmbedding:
    """Token embedding base class.

    To load token embedding from an externally hosted pre-trained token embedding file, such as
    those of GloVe and FastText, use :func:`gluonnlp.embedding.create`.
    To get all the available `embedding_name` and `source`, use
    :func:`gluonnlp.embedding.list_sources`.

    Alternatively, to load embedding vectors from a custom pre-trained token embedding file, use
    :func:`gluonnlp.embedding.TokenEmbedding.from_file`.

    If `unknown_token` is None, looking up unknown tokens results in KeyError.
    Otherwise, for every unknown token, if its representation `self.unknown_token` is encountered
    in the pre-trained token embedding file, index 0 of `self.idx_to_vec` maps to the pre-trained
    token embedding vector loaded from the file; otherwise, index 0 of `self.idx_to_vec` maps to
    the token embedding vector initialized by `init_unknown_vec`.

    If a token is encountered multiple times in the pre-trained token embedding file, only the
    first-encountered token embedding vector will be loaded and the rest will be skipped.


    Parameters
    ----------
    unknown_token : hashable object or None, default '<unk>'
        Any unknown token will be replaced by unknown_token and consequently will be indexed as
        the same representation. Only used if `unknown_lookup` is not specified.
    init_unknown_vec : callback, default nd.zeros
        The callback used to initialize the embedding vector for the unknown token. Only used if
        `unknown_token` is not None and `idx_to_token` is not None and does not contain
        `unknown_token`.
    allow_extend : bool, default False
        If True, embedding vectors for previously unknown words can be added via
        token_embedding[tokens] = vecs. If False, only vectors for known tokens can be updated.
    unknown_lookup : object subscriptable with list of tokens returning nd.NDarray, default None
        If not None, the TokenEmbedding obtains embeddings for unknown tokens automatically from
        `unknown_lookup[unknown_tokens]`. For example, in a FastText model, embeddings for
        unknown tokens can be computed from the subword information.
    idx_to_token : list of str or None, default None
        If not None, a list of tokens for which the `idx_to_vec` argument provides embeddings.
        The list indices and the indices of `idx_to_vec` must be aligned.
        If `idx_to_token` is not None, `idx_to_vec` must not be None either.
        If `idx_to_token` is None, an empty TokenEmbedding object is created.
        If `allow_extend` is True, tokens and their embeddings can be added to the TokenEmbedding
        at a later stage.
    idx_to_vec : mxnet.ndarray.NDArray or None, default None
        If not None, a NDArray containing embeddings for the tokens specified in `idx_to_token`.
        The first dimension of `idx_to_vec` must be aligned with `idx_to_token`.
        If `idx_to_vec` is not None, `idx_to_token` must not be None either.
        If `idx_to_vec` is None, an empty TokenEmbedding object is created.
        If `allow_extend` is True, tokens and their embeddings can be added to the TokenEmbedding
        at a later stage.
        No copy of the idx_to_vec array is made as long as unknown_token is None or an embedding
        for unknown_token is specified in `idx_to_vec`.
""" def __init__(self, unknown_token=C.UNK_TOKEN, init_unknown_vec=INIT_UNKNOWN_VEC, allow_extend=False, unknown_lookup=None, idx_to_token=None, idx_to_vec=None): unknown_index = None # With pre-specified tokens and vectors if idx_to_vec is not None or idx_to_token is not None: idx_to_token = idx_to_token[:] # Sanity checks if idx_to_vec is None or idx_to_token is None: raise ValueError('Must specify either none or both of ' 'idx_to_token and idx_to_vec.') if idx_to_vec.shape[0] != len(idx_to_token): raise ValueError('idx_to_token and idx_to_vec must contain ' 'the same number of tokens and embeddings respectively.') if unknown_token is not None: try: unknown_index = idx_to_token.index(unknown_token) if init_unknown_vec is not None: logging.info('Ignoring init_unknown_vec as idx_to_vec is specified') except ValueError: if init_unknown_vec is not None: idx_to_token.insert(0, unknown_token) idx_to_vec = nd.concat(init_unknown_vec((1, idx_to_vec.shape[1])), idx_to_vec, dim=0) unknown_index = 0 else: raise ValueError('unknown_token "{}" is not part of idx_to_vec but ' 'init_unknown_vec is None. ' 'You must provide either of them.'.format(unknown_token)) # Initialization self._unknown_token = unknown_token self._init_unknown_vec = init_unknown_vec self._allow_extend = allow_extend self._unknown_lookup = unknown_lookup self._idx_to_token = idx_to_token self._idx_to_vec = idx_to_vec # Empty token-embedding else: # Initialization self._unknown_token = unknown_token if self._unknown_token is not None: unknown_index = UNK_IDX self._init_unknown_vec = init_unknown_vec self._allow_extend = allow_extend self._unknown_lookup = unknown_lookup assert UNK_IDX == 0 self._idx_to_token = [unknown_token] if unknown_token else [] self._idx_to_vec = None # Initialization of token_to_idx mapping if self._unknown_token: assert unknown_index is not None self._token_to_idx = DefaultLookupDict(unknown_index) else: self._token_to_idx = {} self._token_to_idx.update((token, idx) for idx, token in enumerate(self._idx_to_token)) @staticmethod def _get_file_url(cls_name, source_file_hash, source): namespace = 'gluon/embeddings/{}'.format(cls_name) return _get_repo_file_url(namespace, source_file_hash[source][0]) @classmethod def _get_file_path(cls, source_file_hash, embedding_root, source): cls_name = cls.__name__.lower() embedding_root = os.path.expanduser(embedding_root) url = cls._get_file_url(cls_name, source_file_hash, source) embedding_dir = os.path.join(embedding_root, cls_name) pretrained_file_name, expected_file_hash = source_file_hash[source] pretrained_file_path = os.path.join(embedding_dir, pretrained_file_name) if not os.path.exists(pretrained_file_path) \ or not check_sha1(pretrained_file_path, expected_file_hash): print('Embedding file {} is not found. Downloading from Gluon Repository. ' 'This may take some time.'.format(pretrained_file_name)) download(url, pretrained_file_path, sha1_hash=expected_file_hash) return pretrained_file_path @staticmethod def _load_embedding(pretrained_file_path, elem_delim, unknown_token, init_unknown_vec, encoding=ENCODING): """Load embedding vectors from a pre-trained token embedding file. Both text files and TokenEmbedding serialization files are supported. elem_delim and encoding are ignored for non-text files. 
For every unknown token, if its representation `self.unknown_token` is encountered in the pre-trained token embedding file, index 0 of `self.idx_to_vec` maps to the pre-trained token embedding vector loaded from the file; otherwise, index 0 of `self.idx_to_vec` maps to the text embedding vector initialized by `self._init_unknown_vec`. If a token is encountered multiple times in the pre-trained text embedding file, only the first-encountered token embedding vector will be loaded and the rest will be skipped. """ pretrained_file_path = os.path.expanduser(pretrained_file_path) if not os.path.isfile(pretrained_file_path): raise ValueError('`pretrained_file_path` must be a valid path ' 'to the pre-trained token embedding file.') logging.info('Loading pre-trained token embedding vectors from %s', pretrained_file_path) if pretrained_file_path.endswith('.npz'): return TokenEmbedding._load_embedding_serialized( pretrained_file_path=pretrained_file_path, unknown_token=unknown_token, init_unknown_vec=init_unknown_vec) else: return TokenEmbedding._load_embedding_txt( pretrained_file_path=pretrained_file_path, elem_delim=elem_delim, unknown_token=unknown_token, init_unknown_vec=init_unknown_vec, encoding=encoding) @staticmethod def _load_embedding_txt(pretrained_file_path, elem_delim, unknown_token, init_unknown_vec, encoding=ENCODING): """Load embedding vectors from a pre-trained token embedding file. Returns idx_to_token, idx_to_vec and unknown_token suitable for the TokenEmbedding constructor. For every unknown token, if its representation `unknown_token` is encountered in the pre-trained token embedding file, index 0 of `idx_to_vec` maps to the pre-trained token embedding vector loaded from the file; otherwise, index 0 of `idx_to_vec` maps to the text embedding vector initialized by `init_unknown_vec`. If a token is encountered multiple times in the pre-trained text embedding file, only the first-encountered token embedding vector will be loaded and the rest will be skipped. """ idx_to_token = [unknown_token] if unknown_token else [] unk_idx = None if unknown_token: unk_idx = 0 vec_len = None all_elems = [] tokens = set() loaded_unknown_vec = None with io.open(pretrained_file_path, 'rb') as f: for line_num, line in enumerate(f): try: line = line.decode(encoding) # pytype: disable=attribute-error except ValueError: warnings.warn('line {} in {}: failed to decode. Skipping.' .format(line_num, pretrained_file_path)) continue elems = line.rstrip().split(elem_delim) assert len(elems) > 1, 'line {} in {}: unexpected data format.'.format( line_num, pretrained_file_path) token, elems = elems[0], [float(i) for i in elems[1:]] if loaded_unknown_vec is None and token == unknown_token: loaded_unknown_vec = elems tokens.add(unknown_token) elif token in tokens: warnings.warn('line {} in {}: duplicate embedding found for ' 'token "{}". Skipped.'.format(line_num, pretrained_file_path, token)) elif len(elems) == 1 and line_num == 0: warnings.warn('line {} in {}: skipped likely header line.' .format(line_num, pretrained_file_path)) else: if not vec_len: vec_len = len(elems) if unknown_token: # Reserve a vector slot for the unknown token at the very beggining # because the unknown token index is 0. assert len(all_elems) == 0 all_elems.extend([0] * vec_len) else: assert len(elems) == vec_len, \ 'line {} in {}: found vector of inconsistent dimension for token ' \ '"{}". 
expected dim: {}, found: {}'.format(line_num, pretrained_file_path, token, vec_len, len(elems)) all_elems.extend(elems) idx_to_token.append(token) tokens.add(token) idx_to_vec = nd.array(all_elems).reshape((-1, vec_len)) if unknown_token: if loaded_unknown_vec is None: idx_to_vec[unk_idx] = init_unknown_vec(shape=vec_len) else: idx_to_vec[unk_idx] = nd.array(loaded_unknown_vec) return idx_to_token, idx_to_vec, unknown_token @staticmethod def _load_embedding_serialized(pretrained_file_path, unknown_token, init_unknown_vec): """Load embedding vectors from a pre-trained token embedding file. Returns idx_to_token, idx_to_vec and unknown_token suitable for the TokenEmbedding constructor. ValueError is raised if a token occurs multiple times. """ deserialized_embedding = TokenEmbedding.deserialize(pretrained_file_path) idx_to_token = deserialized_embedding.idx_to_token if len(set(idx_to_token)) != len(idx_to_token): raise ValueError('Serialized embedding contains duplicate tokens.') idx_to_vec = deserialized_embedding.idx_to_vec vec_len = idx_to_vec.shape[1] loaded_unknown_vec = False if deserialized_embedding.unknown_token: if not unknown_token: # If the TokenEmbedding shall not have an unknown token but the # serialized file provided one, delete the provided one. unk_idx = deserialized_embedding.token_to_idx[ deserialized_embedding.unknown_token] assert unk_idx >= 0 if unk_idx == 0: idx_to_token = idx_to_token[1:] idx_to_vec = idx_to_vec[1:] else: idx_to_token = idx_to_token[:unk_idx] + idx_to_token[unk_idx + 1:] idx_to_vec = nd.concat(idx_to_vec[:unk_idx], idx_to_vec[unk_idx + 1:], dim=0) else: # If the TokenEmbedding shall have an unknown token and the # serialized file provided one, replace the representation. unk_idx = deserialized_embedding.token_to_idx[ deserialized_embedding.unknown_token] idx_to_token[unk_idx] = unknown_token loaded_unknown_vec = True else: if unknown_token and unknown_token not in idx_to_token: # If the TokenEmbedding shall have an unknown token but the # serialized file didn't provided one, insert a new one idx_to_token = [unknown_token] + idx_to_token idx_to_vec = nd.concat(nd.zeros((1, vec_len)), idx_to_vec, dim=0) elif unknown_token: # The serialized file did define a unknown token, but contains # the token that is specified by the user to represent the # unknown token. assert not deserialized_embedding.unknown_token loaded_unknown_vec = True # Move unknown_token to idx 0 to replicate the behavior of # _load_embedding_text unk_idx = idx_to_token.index(unknown_token) if unk_idx > 0: idx_to_token[0], idx_to_token[unk_idx] = idx_to_token[unk_idx], idx_to_token[0] idx_to_vec[[0, unk_idx]] = idx_to_vec[[unk_idx, 0]] else: assert not deserialized_embedding.unknown_token assert not unknown_token if unknown_token and init_unknown_vec and not loaded_unknown_vec: unk_idx = idx_to_token.index(unknown_token) idx_to_vec[unk_idx] = init_unknown_vec(shape=vec_len) return idx_to_token, idx_to_vec, unknown_token @property def idx_to_token(self): """Index to token mapping. Returns ------- list of str: A list of indexed tokens where the list indices and the token indices are aligned. """ return self._idx_to_token @property def token_to_idx(self): """Token to index mapping. Returns ------- dict of int to strs: A dictionary of tokens with their corresponding index numbers; inverse vocab. """ return self._token_to_idx @property def idx_to_vec(self): """Index to vector mapping. 
Returns ------- mxnet.ndarray.NDArray: For all the indexed tokens in this embedding, this NDArray maps each token's index to an embedding vector. """ return self._idx_to_vec @property def unknown_token(self): """Unknown token representation. Any token that is unknown will be indexed using the representation of unknown_token. Returns ------- hashable object or None: Unknown token representation """ return self._unknown_token @property def allow_extend(self): """Allow extension of the TokenEmbedding with new tokens. If True, `TokenEmbedding[tokens] = vec` can introduce new tokens that were previously unknown. New indices will be assigned to the newly introduced tokens. If False, only known tokens can be updated. Returns ------- bool: Extension of the TokenEmbedding is allowed. """ return self._allow_extend @property def unknown_lookup(self): """Vector lookup for unknown tokens. If not None, unknown_lookup[tokens] is automatically called for any unknown tokens. Returns ------- Mapping[List[str], nd.NDarray] Vector lookup mapping from tokens to vectors. """ return self._unknown_lookup @unknown_lookup.setter def unknown_lookup(self, unknown_lookup): """Vector lookup for unknown tokens. If not None, unknown_lookup[tokens] is called for any unknown tokens. Parameters ---------- unknown_lookup : Mapping[List[str], nd.NDarray] Vector lookup mapping from tokens to vectors. """ self._unknown_lookup = unknown_lookup
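    # Illustrative sketch (not part of the original source): the properties
    # above expose the vocabulary and the embedding matrix directly, assuming
    # an embedding instance `emb` that keeps the default '<unk>' token.
    #
    # >>> emb.idx_to_token[0]              # '<unk>'
    # >>> emb.token_to_idx['<unk>']        # 0
    # >>> emb.idx_to_vec.shape             # (len(emb.idx_to_token), vec_len)
    # >>> emb.unknown_token                # '<unk>'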
    def __contains__(self, token):
        """Check if token is known.

        Parameters
        ----------
        token : str
            A token.

        Returns
        -------
        bool:
            Return True if the token is known. A token is known if it has been
            assigned an index and vector.
        """
        return token in self._token_to_idx
    def __eq__(self, other):
        if isinstance(other, TokenEmbedding):
            return self.unknown_token == other.unknown_token \
                and self.idx_to_token == other.idx_to_token and \
                ((self.idx_to_vec == other.idx_to_vec).min().asscalar() == 1) \
                and (self._token_to_idx == other._token_to_idx)
        else:
            return NotImplemented

    def __ne__(self, other):
        result = self.__eq__(other)
        if result is NotImplemented:
            return NotImplemented
        else:
            return not result
    def __getitem__(self, tokens):
        """Looks up embedding vectors of text tokens.

        Parameters
        ----------
        tokens : str or list of strs
            A token or a list of tokens.

        Returns
        -------
        mxnet.ndarray.NDArray:
            The embedding vector(s) of the token(s). According to numpy
            conventions, if `tokens` is a string, returns a 1-D NDArray
            (vector); if `tokens` is a list of strings, returns a 2-D NDArray
            (matrix) of shape=(len(tokens), vec_len).
        """
        to_reduce = not isinstance(tokens, (list, tuple))
        if to_reduce:
            tokens = [tokens]

        if self.unknown_lookup is not None:
            if self.idx_to_vec is None:
                # May raise KeyError, but we cannot fallback to idx_to_vec's
                # unknown vector, as idx_to_vec has not been initialized yet.
                # Cannot initialize it, as we don't know the dimension.
                vecs = self.unknown_lookup[tokens]
            else:
                vecs = [
                    self.idx_to_vec[self.token_to_idx[token]]
                    if (token in self.token_to_idx
                        or token not in self.unknown_lookup)
                    else self.unknown_lookup[token] for token in tokens]
                vecs = nd.stack(*vecs, axis=0)
        else:
            indices = [self._token_to_idx[token] for token in tokens]
            vecs = nd.Embedding(
                nd.array(indices), self.idx_to_vec, self.idx_to_vec.shape[0],
                self.idx_to_vec.shape[1])

        return vecs[0] if to_reduce else vecs
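    # Illustrative lookup sketch (not part of the original source), assuming an
    # embedding instance `emb` created via `gluonnlp.embedding.create` or
    # `TokenEmbedding.from_file`. A single token yields a 1-D vector, a list of
    # tokens a 2-D matrix, and unknown tokens fall back to index 0 (or to
    # `unknown_lookup` if one is attached).
    #
    # >>> vec = emb['hello']               # shape: (vec_len,)
    # >>> mat = emb[['hello', 'world']]    # shape: (2, vec_len)
    # >>> unk = emb['some-unseen-token']   # vector at index 0 unless unknown_lookup is set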
    def _check_vector_update(self, tokens, new_embedding):
        """Check that tokens and embedding are in the format for __setitem__."""
        assert self._idx_to_vec is not None, '`idx_to_vec` has not been initialized.'

        if not isinstance(tokens, (list, tuple)) or len(tokens) == 1:
            assert isinstance(new_embedding, nd.NDArray) and len(new_embedding.shape) in [1, 2], \
                '`new_embedding` must be a 1-D or 2-D NDArray if `tokens` is a single token.'
            if not isinstance(tokens, (list, tuple)):
                tokens = [tokens]
            if len(new_embedding.shape) == 1:
                new_embedding = new_embedding.expand_dims(0)
        else:
            assert isinstance(new_embedding, nd.NDArray) and len(new_embedding.shape) == 2, \
                '`new_embedding` must be a 2-D NDArray if `tokens` is a list of multiple strings.'
        if self._idx_to_vec is not None:
            assert new_embedding.shape == (len(tokens), self._idx_to_vec.shape[1]), \
                'The length of `new_embedding` must be equal to the number ' \
                'of tokens and the width of new_embedding must be equal ' \
                'to the dimension of embedding of the glossary.'
        else:
            assert new_embedding.shape[0] == len(tokens), \
                'The length of `new_embedding` must be equal to the number of tokens'
        return tokens
[docs] def __setitem__(self, tokens, new_embedding): """Updates embedding vectors for tokens. If self.allow_extend is True, vectors for previously unknown tokens can be introduced. Parameters ---------- tokens : hashable object or a list or tuple of hashable objects A token or a list of tokens whose embedding vector are to be updated. new_embedding : mxnet.ndarray.NDArray An NDArray to be assigned to the embedding vectors of `tokens`. Its length must be equal to the number of `tokens` and its width must be equal to the dimension of embedding of the glossary. If `tokens` is a singleton, it must be 1-D or 2-D. If `tokens` is a list of multiple strings, it must be 2-D. """ if not isinstance(tokens, (list, tuple)): tokens = [tokens] if ((self.allow_extend or all(t in self.token_to_idx for t in tokens)) and self._idx_to_vec is None): # Initialize self._idx_to_vec assert UNK_IDX == 0 self._idx_to_vec = self._init_unknown_vec( shape=(1, new_embedding.shape[-1])) tokens = self._check_vector_update(tokens, new_embedding) if self.allow_extend: # Add new / previously unknown tokens len_before = len(self._token_to_idx) for token in tokens: if token not in self._token_to_idx: idx = len(self._token_to_idx) self._token_to_idx[token] = idx self._idx_to_token.append(token) num_extended = len(self._token_to_idx) - len_before if num_extended >= 1: if num_extended == 1: warnings.warn( 'When adding new tokens via TokenEmbedding.__setitem__ ' 'the internal embedding matrix needs to be reallocated. ' 'Users are therefore encouraged to batch their updates ' '(i.e. add multiple new tokens at a time).') # Extend shape of idx_to_vec idx_to_vec = nd.zeros(shape=(len(self._token_to_idx), self.idx_to_vec.shape[1])) idx_to_vec[:self.idx_to_vec.shape[0]] = self._idx_to_vec self._idx_to_vec = idx_to_vec indices = [] for token in tokens: if token in self._token_to_idx: indices.append(self._token_to_idx[token]) else: if self.unknown_token: raise KeyError(('Token "{}" is unknown. To update the embedding vector for an' ' unknown token, please explicitly include "{}" as the ' '`unknown_token` in `tokens`. This is to avoid unintended ' 'updates.').format(token, self.unknown_token)) raise KeyError(('Token "{}" is unknown. Updating the embedding vector for an ' 'unknown token is not allowed because `unknown_token` is not ' 'specified.').format(token)) self._idx_to_vec[nd.array(indices)] = new_embedding
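    # Illustrative update/extension sketch for `__setitem__` above (not part of
    # the original source). With `allow_extend=True`, assigning to previously
    # unknown tokens grows the vocabulary; because the embedding matrix is
    # reallocated on extension, new tokens are best added in one batched
    # assignment. The token names below are made up.
    #
    # >>> import mxnet as mx
    # >>> import gluonnlp
    # >>> emb = gluonnlp.embedding.TokenEmbedding(allow_extend=True)
    # >>> emb[['foo', 'bar']] = mx.nd.random.uniform(shape=(2, 300))
    # >>> emb['foo'] = mx.nd.zeros(300)    # update an existing token's vector in place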
    @classmethod
    def _check_source(cls, source_file_hash, source):
        """Checks if a pre-trained token embedding source name is valid.

        Parameters
        ----------
        source : str
            The pre-trained token embedding source.
        """
        embedding_name = cls.__name__.lower()
        if source not in source_file_hash:
            raise KeyError('Cannot find pre-trained source {source} for token embedding {name}. '
                           'Valid pre-trained file names for embedding {name}: {values}'.format(
                               source=source, name=embedding_name,
                               values=', '.join(source_file_hash.keys())))
[docs] @staticmethod def from_file(file_path, elem_delim=' ', encoding=ENCODING, **kwargs): """Creates a user-defined token embedding from a pre-trained embedding file. This is to load embedding vectors from a user-defined pre-trained token embedding file. For example, if `elem_delim` = ' ', the expected format of a custom pre-trained token embedding file may look like: 'hello 0.1 0.2 0.3 0.4 0.5\\\\nworld 1.1 1.2 1.3 1.4 1.5\\\\n' where embedding vectors of words `hello` and `world` are [0.1, 0.2, 0.3, 0.4, 0.5] and [1.1, 1.2, 1.3, 1.4, 1.5] respectively. Parameters ---------- file_path : str The path to the user-defined pre-trained token embedding file. elem_delim : str, default ' ' The delimiter for splitting a token and every embedding vector element value on the same line of the custom pre-trained token embedding file. encoding : str, default 'utf8' The encoding scheme for reading the custom pre-trained token embedding file. kwargs : dict All other keyword arguments are passed to the TokenEmbedding initializer. Returns ------- instance of :class:`gluonnlp.embedding.TokenEmbedding` The user-defined token embedding instance. """ unknown_token = kwargs.pop('unknown_token', C.UNK_TOKEN) init_unknown_vec = kwargs.pop('init_unknown_vec', INIT_UNKNOWN_VEC) idx_to_token, idx_to_vec, unknown_token = TokenEmbedding._load_embedding( file_path, elem_delim=elem_delim, unknown_token=unknown_token, init_unknown_vec=init_unknown_vec, encoding=encoding) assert 'idx_to_vec' not in kwargs assert 'idx_to_token' not in kwargs return TokenEmbedding(unknown_token=unknown_token, init_unknown_vec=None, idx_to_token=idx_to_token, idx_to_vec=idx_to_vec, **kwargs)
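    # Illustrative sketch for `from_file` above (not part of the original
    # source): write a tiny whitespace-delimited embedding file and load it.
    # The file name and its contents are made up for the example.
    #
    # >>> import io
    # >>> import gluonnlp
    # >>> with io.open('tiny_embedding.txt', 'w', encoding='utf8') as f:
    # ...     _ = f.write(u'hello 0.1 0.2 0.3 0.4 0.5\nworld 1.1 1.2 1.3 1.4 1.5\n')
    # >>> emb = gluonnlp.embedding.TokenEmbedding.from_file('tiny_embedding.txt')
    # >>> emb['hello'].shape               # (5,)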
[docs] def serialize(self, file_path, compress=True): """Serializes the TokenEmbedding to a file specified by file_path. TokenEmbedding is serialized by converting the list of tokens, the array of word embeddings and other metadata to numpy arrays, saving all in a single (optionally compressed) Zipfile. See https://docs.scipy.org/doc/numpy-1.14.2/neps/npy-format.html for more information on the format. Parameters ---------- file_path : str or file The path at which to create the file holding the serialized TokenEmbedding. If file is a string or a Path, the .npz extension will be appended to the file name if it is not already there. compress : bool, default True Compress the Zipfile or leave it uncompressed. """ if self.unknown_lookup is not None: warnings.warn( 'Serialization of `unknown_lookup` is not supported. ' 'Save it manually and pass the loaded lookup object ' 'during deserialization.') unknown_token = np.array(self.unknown_token) idx_to_token = np.array(self.idx_to_token, dtype='O') idx_to_vec = self.idx_to_vec.asnumpy() if not unknown_token: # Store empty string instead of None unknown_token = '' if not compress: np.savez(file=file_path, unknown_token=unknown_token, idx_to_token=idx_to_token, idx_to_vec=idx_to_vec) else: np.savez_compressed(file=file_path, unknown_token=unknown_token, idx_to_token=idx_to_token, idx_to_vec=idx_to_vec)
[docs] @staticmethod def deserialize(file_path, **kwargs): """Create a new TokenEmbedding from a serialized one. TokenEmbedding is serialized by converting the list of tokens, the array of word embeddings and other metadata to numpy arrays, saving all in a single (optionally compressed) Zipfile. See https://docs.scipy.org/doc/numpy-1.14.2/neps/npy-format.html for more information on the format. Parameters ---------- file_path : str or file The path to a file that holds the serialized TokenEmbedding. kwargs : dict Keyword arguments are passed to the TokenEmbedding initializer. Useful for attaching unknown_lookup. """ # idx_to_token is of dtype 'O' so we need to allow pickle npz_dict = np.load(file_path, allow_pickle=True) unknown_token = npz_dict['unknown_token'] if not unknown_token: unknown_token = None else: if isinstance(unknown_token, np.ndarray): if unknown_token.dtype.kind == 'S': unknown_token = unknown_token.tobytes().decode() else: unknown_token = str(unknown_token) idx_to_token = npz_dict['idx_to_token'].tolist() idx_to_vec = nd.array(npz_dict['idx_to_vec']) assert 'unknown_token' not in kwargs assert 'init_unknown_vec' not in kwargs assert 'idx_to_vec' not in kwargs assert 'idx_to_token' not in kwargs return TokenEmbedding(unknown_token=unknown_token, init_unknown_vec=None, idx_to_token=idx_to_token, idx_to_vec=idx_to_vec, **kwargs)
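    # Illustrative round-trip sketch for `serialize`/`deserialize` above (not
    # part of the original source), assuming an embedding instance `emb` as in
    # the sketches above. The file name is made up; the `.npz` extension is
    # appended automatically if it is missing.
    #
    # >>> emb.serialize('my_embedding.npz')
    # >>> restored = gluonnlp.embedding.TokenEmbedding.deserialize('my_embedding.npz')
    # >>> restored == emb                  # True: __eq__ compares tokens and vectors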
[docs]@register class GloVe(TokenEmbedding): """The GloVe word embedding. GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space. (Source from https://nlp.stanford.edu/projects/glove/) Reference: GloVe: Global Vectors for Word Representation. Jeffrey Pennington, Richard Socher, and Christopher D. Manning. https://nlp.stanford.edu/pubs/glove.pdf Website: https://nlp.stanford.edu/projects/glove/ To get the updated URLs to the externally hosted pre-trained token embedding files, visit https://nlp.stanford.edu/projects/glove/ License for pre-trained embedding: https://opendatacommons.org/licenses/pddl/ Available sources >>> import gluonnlp as nlp >>> sorted(nlp.embedding.list_sources('GloVe')) [\ 'glove.42B.300d', 'glove.6B.100d', 'glove.6B.200d', 'glove.6B.300d', 'glove.6B.50d', \ 'glove.840B.300d', 'glove.twitter.27B.100d', 'glove.twitter.27B.200d', \ 'glove.twitter.27B.25d', 'glove.twitter.27B.50d'\ ] Parameters ---------- source : str, default 'glove.6B.50d' The name of the pre-trained token embedding file. embedding_root : str, default '$MXNET_HOME/embedding' The root directory for storing embedding-related files. MXNET_HOME defaults to '~/.mxnet'. kwargs All other keyword arguments are passed to `gluonnlp.embedding.TokenEmbedding`. Attributes ---------- idx_to_vec : mxnet.ndarray.NDArray For all the indexed tokens in this embedding, this NDArray maps each token's index to an embedding vector. unknown_token : hashable object The representation for any unknown token. In other words, any unknown token will be indexed as the same representation. """ # Map a pre-trained token embedding file and its SHA-1 hash. source_file_hash = C.GLOVE_NPZ_SHA1 def __init__(self, source='glove.6B.50d', embedding_root=os.path.join(get_home_dir(), 'embedding'), **kwargs): self._check_source(self.source_file_hash, source) pretrained_file_path = GloVe._get_file_path(self.source_file_hash, embedding_root, source) unknown_token = kwargs.pop('unknown_token', C.UNK_TOKEN) init_unknown_vec = kwargs.pop('init_unknown_vec', INIT_UNKNOWN_VEC) encoding = kwargs.pop('encoding', ENCODING) idx_to_token, idx_to_vec, unknown_token = self._load_embedding( pretrained_file_path=pretrained_file_path, elem_delim=' ', unknown_token=unknown_token, init_unknown_vec=init_unknown_vec, encoding=encoding) assert 'idx_to_vec' not in kwargs assert 'idx_to_token' not in kwargs super(GloVe, self).__init__(unknown_token=unknown_token, init_unknown_vec=None, idx_to_token=idx_to_token, idx_to_vec=idx_to_vec, **kwargs)
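# Illustrative usage sketch for GloVe (not part of the original source).
# `gluonnlp.data.count_tokens`, `gluonnlp.Vocab` and its `set_embedding`
# method come from the wider gluonnlp package, not from this module; the toy
# sentence below is made up.
#
# >>> import gluonnlp as nlp
# >>> glove = nlp.embedding.GloVe(source='glove.6B.50d')
# >>> counter = nlp.data.count_tokens('hello world hello'.split())
# >>> vocab = nlp.Vocab(counter)
# >>> vocab.set_embedding(glove)
# >>> vocab.embedding['hello'].shape      # (50,)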
[docs]@register class FastText(TokenEmbedding): """The fastText word embedding. FastText is an open-source, free, lightweight library that allows users to learn text representations and text classifiers. It works on standard, generic hardware. Models can later be reduced in size to even fit on mobile devices. (Source from https://fasttext.cc/) References: Enriching Word Vectors with Subword Information. Piotr Bojanowski, Edouard Grave, Armand Joulin, and Tomas Mikolov. https://arxiv.org/abs/1607.04606 Bag of Tricks for Efficient Text Classification. Armand Joulin, Edouard Grave, Piotr Bojanowski, and Tomas Mikolov. https://arxiv.org/abs/1607.01759 FastText.zip: Compressing text classification models. Armand Joulin, Edouard Grave, Piotr Bojanowski, Matthijs Douze, Herve Jegou, and Tomas Mikolov. https://arxiv.org/abs/1612.03651 For 'wiki.multi' embedding: Word Translation Without Parallel Data Alexis Conneau, Guillaume Lample, Marc'Aurelio Ranzato, Ludovic Denoyer, and Herve Jegou. https://arxiv.org/abs/1710.04087 Website: https://fasttext.cc/ To get the updated URLs to the externally hosted pre-trained token embedding files, visit https://github.com/facebookresearch/fastText/blob/master/docs/pretrained-vectors.md License for pre-trained embedding: https://creativecommons.org/licenses/by-sa/3.0/ Available sources >>> import gluonnlp as nlp >>> sorted(nlp.embedding.list_sources('FastText')) [\ 'cc.af.300', 'cc.als.300', 'cc.am.300', 'cc.an.300', 'cc.ar.300', 'cc.arz.300', \ 'cc.as.300', 'cc.ast.300', 'cc.az.300', 'cc.azb.300', 'cc.ba.300', 'cc.bar.300', \ 'cc.bcl.300', 'cc.be.300', 'cc.bg.300', 'cc.bh.300', 'cc.bn.300', 'cc.bo.300', \ 'cc.bpy.300', 'cc.br.300', 'cc.bs.300', 'cc.ca.300', 'cc.ce.300', 'cc.ceb.300', \ 'cc.ckb.300', 'cc.co.300', 'cc.cs.300', 'cc.cv.300', 'cc.cy.300', 'cc.da.300', \ 'cc.de.300', 'cc.diq.300', 'cc.dv.300', 'cc.el.300', 'cc.eml.300', 'cc.en.300', \ 'cc.eo.300', 'cc.es.300', 'cc.et.300', 'cc.eu.300', 'cc.fa.300', 'cc.fi.300', \ 'cc.fr.300', 'cc.frr.300', 'cc.fy.300', 'cc.ga.300', 'cc.gd.300', 'cc.gl.300', \ 'cc.gom.300', 'cc.gu.300', 'cc.gv.300', 'cc.he.300', 'cc.hi.300', 'cc.hif.300', \ 'cc.hr.300', 'cc.hsb.300', 'cc.ht.300', 'cc.hu.300', 'cc.hy.300', 'cc.ia.300', \ 'cc.id.300', 'cc.ilo.300', 'cc.io.300', 'cc.is.300', 'cc.it.300', 'cc.ja.300', \ 'cc.jv.300', 'cc.ka.300', 'cc.kk.300', 'cc.km.300', 'cc.kn.300', 'cc.ko.300', \ 'cc.ku.300', 'cc.ky.300', 'cc.la.300', 'cc.lb.300', 'cc.li.300', 'cc.lmo.300', \ 'cc.lt.300', 'cc.lv.300', 'cc.mai.300', 'cc.mg.300', 'cc.mhr.300', 'cc.min.300', \ 'cc.mk.300', 'cc.ml.300', 'cc.mn.300', 'cc.mr.300', 'cc.mrj.300', 'cc.ms.300', \ 'cc.mt.300', 'cc.mwl.300', 'cc.my.300', 'cc.myv.300', 'cc.mzn.300', 'cc.nah.300', \ 'cc.nap.300', 'cc.nds.300', 'cc.ne.300', 'cc.new.300', 'cc.nl.300', 'cc.nn.300', \ 'cc.no.300', 'cc.nso.300', 'cc.oc.300', 'cc.or.300', 'cc.os.300', 'cc.pa.300', \ 'cc.pam.300', 'cc.pfl.300', 'cc.pl.300', 'cc.pms.300', 'cc.pnb.300', 'cc.ps.300', \ 'cc.pt.300', 'cc.qu.300', 'cc.rm.300', 'cc.ro.300', 'cc.ru.300', 'cc.sa.300', \ 'cc.sah.300', 'cc.sc.300', 'cc.scn.300', 'cc.sco.300', 'cc.sd.300', 'cc.sh.300', \ 'cc.si.300', 'cc.sk.300', 'cc.sl.300', 'cc.so.300', 'cc.sq.300', 'cc.sr.300', \ 'cc.su.300', 'cc.sv.300', 'cc.sw.300', 'cc.ta.300', 'cc.te.300', 'cc.tg.300', \ 'cc.th.300', 'cc.tk.300', 'cc.tl.300', 'cc.tr.300', 'cc.tt.300', 'cc.ug.300', \ 'cc.uk.300', 'cc.ur.300', 'cc.uz.300', 'cc.vec.300', 'cc.vi.300', 'cc.vls.300', \ 'cc.vo.300', 'cc.wa.300', 'cc.war.300', 'cc.xmf.300', 'cc.yi.300', 'cc.yo.300', \ 'cc.zea.300', 
'cc.zh.300', 'crawl-300d-2M', 'crawl-300d-2M-subword', \ 'wiki-news-300d-1M', 'wiki-news-300d-1M-subword', 'wiki.aa', 'wiki.ab', 'wiki.ace', \ 'wiki.ady', 'wiki.af', 'wiki.ak', 'wiki.als', 'wiki.am', 'wiki.an', 'wiki.ang', \ 'wiki.ar', 'wiki.arc', 'wiki.arz', 'wiki.as', 'wiki.ast', 'wiki.av', 'wiki.ay', \ 'wiki.az', 'wiki.azb', 'wiki.ba', 'wiki.bar', 'wiki.bat_smg', 'wiki.bcl', 'wiki.be', \ 'wiki.bg', 'wiki.bh', 'wiki.bi', 'wiki.bjn', 'wiki.bm', 'wiki.bn', 'wiki.bo', \ 'wiki.bpy', 'wiki.br', 'wiki.bs', 'wiki.bug', 'wiki.bxr', 'wiki.ca', 'wiki.cbk_zam', \ 'wiki.cdo', 'wiki.ce', 'wiki.ceb', 'wiki.ch', 'wiki.cho', 'wiki.chr', 'wiki.chy', \ 'wiki.ckb', 'wiki.co', 'wiki.cr', 'wiki.crh', 'wiki.cs', 'wiki.csb', 'wiki.cu', \ 'wiki.cv', 'wiki.cy', 'wiki.da', 'wiki.de', 'wiki.diq', 'wiki.dsb', 'wiki.dv', \ 'wiki.dz', 'wiki.ee', 'wiki.el', 'wiki.eml', 'wiki.en', 'wiki.eo', 'wiki.es', \ 'wiki.et', 'wiki.eu', 'wiki.ext', 'wiki.fa', 'wiki.ff', 'wiki.fi', 'wiki.fiu_vro', \ 'wiki.fj', 'wiki.fo', 'wiki.fr', 'wiki.frp', 'wiki.frr', 'wiki.fur', 'wiki.fy', \ 'wiki.ga', 'wiki.gag', 'wiki.gan', 'wiki.gd', 'wiki.gl', 'wiki.glk', 'wiki.gn', \ 'wiki.gom', 'wiki.got', 'wiki.gu', 'wiki.gv', 'wiki.ha', 'wiki.hak', 'wiki.haw', \ 'wiki.he', 'wiki.hi', 'wiki.hif', 'wiki.ho', 'wiki.hr', 'wiki.hsb', 'wiki.ht', \ 'wiki.hu', 'wiki.hy', 'wiki.hz', 'wiki.ia', 'wiki.id', 'wiki.ie', 'wiki.ig', \ 'wiki.ii', 'wiki.ik', 'wiki.ilo', 'wiki.io', 'wiki.is', 'wiki.it', 'wiki.iu', \ 'wiki.ja', 'wiki.jam', 'wiki.jbo', 'wiki.jv', 'wiki.ka', 'wiki.kaa', 'wiki.kab', \ 'wiki.kbd', 'wiki.kg', 'wiki.ki', 'wiki.kj', 'wiki.kk', 'wiki.kl', 'wiki.km', \ 'wiki.kn', 'wiki.ko', 'wiki.koi', 'wiki.kr', 'wiki.krc', 'wiki.ks', 'wiki.ksh', \ 'wiki.ku', 'wiki.kv', 'wiki.kw', 'wiki.ky', 'wiki.la', 'wiki.lad', 'wiki.lb', \ 'wiki.lbe', 'wiki.lez', 'wiki.lg', 'wiki.li', 'wiki.lij', 'wiki.lmo', 'wiki.ln', \ 'wiki.lo', 'wiki.lrc', 'wiki.lt', 'wiki.ltg', 'wiki.lv', 'wiki.mai', 'wiki.map_bms', \ 'wiki.mdf', 'wiki.mg', 'wiki.mh', 'wiki.mhr', 'wiki.mi', 'wiki.min', 'wiki.mk', \ 'wiki.ml', 'wiki.mn', 'wiki.mo', 'wiki.mr', 'wiki.mrj', 'wiki.ms', 'wiki.mt', \ 'wiki.multi.ar', 'wiki.multi.bg', 'wiki.multi.ca', 'wiki.multi.cs', 'wiki.multi.da', \ 'wiki.multi.de', 'wiki.multi.el', 'wiki.multi.en', 'wiki.multi.es', 'wiki.multi.et', \ 'wiki.multi.fi', 'wiki.multi.fr', 'wiki.multi.he', 'wiki.multi.hr', 'wiki.multi.hu', \ 'wiki.multi.id', 'wiki.multi.it', 'wiki.multi.mk', 'wiki.multi.nl', 'wiki.multi.no', \ 'wiki.multi.pl', 'wiki.multi.pt', 'wiki.multi.ro', 'wiki.multi.ru', 'wiki.multi.sk', \ 'wiki.multi.sl', 'wiki.multi.sv', 'wiki.multi.tr', 'wiki.multi.uk', 'wiki.multi.vi', \ 'wiki.mus', 'wiki.mwl', 'wiki.my', 'wiki.myv', 'wiki.mzn', 'wiki.na', 'wiki.nah', \ 'wiki.nap', 'wiki.nds', 'wiki.nds_nl', 'wiki.ne', 'wiki.new', 'wiki.ng', 'wiki.nl', \ 'wiki.nn', 'wiki.no', 'wiki.nov', 'wiki.nrm', 'wiki.nso', 'wiki.nv', 'wiki.ny', \ 'wiki.oc', 'wiki.olo', 'wiki.om', 'wiki.or', 'wiki.os', 'wiki.pa', 'wiki.pag', \ 'wiki.pam', 'wiki.pap', 'wiki.pcd', 'wiki.pdc', 'wiki.pfl', 'wiki.pi', 'wiki.pih', \ 'wiki.pl', 'wiki.pms', 'wiki.pnb', 'wiki.pnt', 'wiki.ps', 'wiki.pt', 'wiki.qu', \ 'wiki.rm', 'wiki.rmy', 'wiki.rn', 'wiki.ro', 'wiki.roa_rup', 'wiki.roa_tara', \ 'wiki.ru', 'wiki.rue', 'wiki.rw', 'wiki.sa', 'wiki.sah', 'wiki.sc', 'wiki.scn', \ 'wiki.sco', 'wiki.sd', 'wiki.se', 'wiki.sg', 'wiki.sh', 'wiki.si', 'wiki.simple', \ 'wiki.sk', 'wiki.sl', 'wiki.sm', 'wiki.sn', 'wiki.so', 'wiki.sq', 'wiki.sr', \ 'wiki.srn', 'wiki.ss', 'wiki.st', 'wiki.stq', 'wiki.su', 'wiki.sv', 'wiki.sw', \ 
'wiki.szl', 'wiki.ta', 'wiki.tcy', 'wiki.te', 'wiki.tet', 'wiki.tg', 'wiki.th', \ 'wiki.ti', 'wiki.tk', 'wiki.tl', 'wiki.tn', 'wiki.to', 'wiki.tpi', 'wiki.tr', \ 'wiki.ts', 'wiki.tt', 'wiki.tum', 'wiki.tw', 'wiki.ty', 'wiki.tyv', 'wiki.udm', \ 'wiki.ug', 'wiki.uk', 'wiki.ur', 'wiki.uz', 'wiki.ve', 'wiki.vec', 'wiki.vep', \ 'wiki.vi', 'wiki.vls', 'wiki.vo', 'wiki.wa', 'wiki.war', 'wiki.wo', 'wiki.wuu', \ 'wiki.xal', 'wiki.xh', 'wiki.xmf', 'wiki.yi', 'wiki.yo', 'wiki.za', 'wiki.zea', \ 'wiki.zh', 'wiki.zh_classical', 'wiki.zh_min_nan', 'wiki.zh_yue', 'wiki.zu'\ ] Parameters ---------- source : str, default 'wiki.simple' The name of the pre-trained token embedding file. embedding_root : str, default '$MXNET_HOME/embedding' The root directory for storing embedding-related files. MXNET_HOME defaults to '~/.mxnet'. load_ngrams : bool, default False Load vectors for ngrams so that computing vectors for OOV words is possible. This is disabled by default as it requires downloading an additional 2GB file containing the vectors for ngrams. Note that facebookresearch did not publish ngram vectors for all their models. If load_ngrams is True, but no ngram vectors are available for the chosen source this a RuntimeError is thrown. The ngram vectors are passed to the resulting TokenEmbedding as `unknown_lookup`. ctx : mx.Context, default mxnet.cpu() Context to load the FasttextEmbeddingModel for ngram vectors to. This parameter is ignored if load_ngrams is False. kwargs All other keyword arguments are passed to `gluonnlp.embedding.TokenEmbedding`. Attributes ---------- idx_to_vec : mxnet.ndarray.NDArray For all the indexed tokens in this embedding, this NDArray maps each token's index to an embedding vector. unknown_token : hashable object The representation for any unknown token. In other words, any unknown token will be indexed as the same representation. """ # Map a pre-trained token embedding file and its SHA-1 hash. source_file_hash = C.FAST_TEXT_NPZ_SHA1 source_bin_file_hash = C.FAST_TEXT_BIN_SHA1 def __init__(self, source='wiki.simple', embedding_root=os.path.join( get_home_dir(), 'embedding'), load_ngrams=False, ctx=cpu(), **kwargs): self._check_source(self.source_file_hash, source) pretrained_file_path = FastText._get_file_path(self.source_file_hash, embedding_root, source) if load_ngrams: try: self._check_source(self.source_bin_file_hash, source) except KeyError: raise KeyError( 'No ngrams are available for {}. ' 'Ngram features were published for the following embeddings: {}'. format(source, ', '.join(self.source_bin_file_hash.keys()))) pretrained_bin_file_path = FastText._get_file_path(self.source_bin_file_hash, embedding_root, source) unknown_lookup = FasttextEmbeddingModel.load_fasttext_format( pretrained_bin_file_path, ctx=ctx) else: unknown_lookup = None unknown_token = kwargs.pop('unknown_token', C.UNK_TOKEN) init_unknown_vec = kwargs.pop('init_unknown_vec', INIT_UNKNOWN_VEC) encoding = kwargs.pop('encoding', ENCODING) idx_to_token, idx_to_vec, unknown_token = self._load_embedding( pretrained_file_path=pretrained_file_path, elem_delim=' ', unknown_token=unknown_token, init_unknown_vec=init_unknown_vec, encoding=encoding) assert 'idx_to_vec' not in kwargs assert 'idx_to_token' not in kwargs super(FastText, self).__init__(unknown_token=unknown_token, init_unknown_vec=None, idx_to_token=idx_to_token, idx_to_vec=idx_to_vec, unknown_lookup=unknown_lookup, **kwargs)
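# Illustrative usage sketch for FastText (not part of the original source).
# With `load_ngrams=True` the subword model is attached as `unknown_lookup`,
# so vectors for out-of-vocabulary words are composed from character ngrams
# (the OOV word below is made up). The same source/flag combination appears in
# the `create` docstring above.
#
# >>> import gluonnlp as nlp
# >>> ft = nlp.embedding.FastText(source='wiki.simple', load_ngrams=True)
# >>> ft['hello'].shape                   # known word, read from idx_to_vec
# >>> ft['helloooooo'].shape              # OOV word, computed via unknown_lookup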
[docs]@register class Word2Vec(TokenEmbedding): """The Word2Vec word embedding. Word2Vec is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed with continuous bag-of-words or skip-gram architecture for computing vector representations of words. References: [1] Tomas Mikolov, Kai Chen, Greg Corrado, and Jeffrey Dean. Efficient Estimation of Word Representations in Vector Space. In Proceedings of Workshop at ICLR, 2013. [2] Tomas Mikolov, Ilya Sutskever, Kai Chen, Greg Corrado, and Jeffrey Dean. Distributed Representations of Words and Phrases and their Compositionality. In Proceedings of NIPS, 2013. [3] Tomas Mikolov, Wen-tau Yih, and Geoffrey Zweig. Linguistic Regularities in Continuous Space Word Representations. In Proceedings of NAACL HLT, 2013. Website: https://code.google.com/archive/p/word2vec/ License for pre-trained embedding: Unspecified Available sources >>> import gluonnlp as nlp >>> sorted(nlp.embedding.list_sources('Word2Vec')) [\ 'GoogleNews-vectors-negative300', 'freebase-vectors-skipgram1000', \ 'freebase-vectors-skipgram1000-en'\ ] Parameters ---------- source : str, default 'GoogleNews-vectors-negative300' The name of the pre-trained token embedding file. A binary pre-trained file outside from the source list can be used for this constructor by passing the path to it which ends with .bin as file extension name. embedding_root : str, default '$MXNET_HOME/embedding' The root directory for storing embedding-related files. MXNET_HOME defaults to '~/.mxnet'. kwargs All other keyword arguments are passed to `gluonnlp.embedding.TokenEmbedding`. Attributes ---------- idx_to_vec : mxnet.ndarray.NDArray For all the indexed tokens in this embedding, this NDArray maps each token's index to an embedding vector. unknown_token : hashable object The representation for any unknown token. In other words, any unknown token will be indexed as the same representation. """ # Map a pre-trained token embedding file and its SHA-1 hash. source_file_hash = C.WORD2VEC_NPZ_SHA1 def __init__(self, source='GoogleNews-vectors-negative300', embedding_root=os.path.join(get_home_dir(), 'embedding'), encoding=ENCODING, **kwargs): unknown_token = kwargs.pop('unknown_token', C.UNK_TOKEN) init_unknown_vec = kwargs.pop('init_unknown_vec', INIT_UNKNOWN_VEC) if source.endswith('.bin'): pretrained_file_path = os.path.expanduser(source) idx_to_token, idx_to_vec, unknown_token = self._load_w2v_binary( pretrained_file_path, unknown_token=unknown_token, init_unknown_vec=init_unknown_vec, encoding=encoding) else: self._check_source(self.source_file_hash, source) pretrained_file_path = self._get_file_path(self.source_file_hash, embedding_root, source) idx_to_token, idx_to_vec, unknown_token = self._load_embedding( pretrained_file_path=pretrained_file_path, elem_delim=' ', unknown_token=unknown_token, init_unknown_vec=init_unknown_vec, encoding=encoding) assert 'idx_to_vec' not in kwargs assert 'idx_to_token' not in kwargs super(Word2Vec, self).__init__(unknown_token=unknown_token, init_unknown_vec=None, idx_to_token=idx_to_token, idx_to_vec=idx_to_vec, **kwargs) @classmethod def _load_w2v_binary(cls, pretrained_file_path, unknown_token, init_unknown_vec=INIT_UNKNOWN_VEC, encoding=ENCODING): """Load embedding vectors from a binary pre-trained token embedding file. Parameters ---------- pretrained_file_path: str The path to a binary pre-trained token embedding file end with .bin as file extension name. encoding: str The encoding type of the file. 
""" idx_to_token = [unknown_token] if unknown_token else [] unk_idx = None if unknown_token: unk_idx = 0 all_elems = [] tokens = set() loaded_unknown_vec = None pretrained_file_path = os.path.expanduser(pretrained_file_path) with io.open(pretrained_file_path, 'rb') as f: header = f.readline().decode(encoding=encoding) # pytype: disable=attribute-error vocab_size, vec_len = (int(x) for x in header.split()) if unknown_token: # Reserve a vector slot for the unknown token at the very beggining # because the unknown token index is 0. all_elems.extend([0] * vec_len) binary_len = np.dtype(np.float32).itemsize * vec_len for line_num in range(vocab_size): token = [] while True: ch = f.read(1) if ch == b' ': break if ch == b'': raise EOFError('unexpected end of input; is count incorrect or file ' 'otherwise damaged?') if ch != b'\n': # ignore newlines in front of words (some binary files have) token.append(ch) try: token = b''.join(token).decode(encoding=encoding) except ValueError: warnings.warn('line {} in {}: failed to decode. Skipping.' .format(line_num, pretrained_file_path)) continue elems = np.frombuffer(f.read(binary_len), dtype=np.float32) assert len(elems) > 1, 'line {} in {}: unexpected data format.'.format( line_num, pretrained_file_path) if token == unknown_token and loaded_unknown_vec is None: loaded_unknown_vec = elems tokens.add(unknown_token) elif token in tokens: warnings.warn('line {} in {}: duplicate embedding found for ' 'token "{}". Skipped.'.format(line_num, pretrained_file_path, token)) else: assert len(elems) == vec_len, \ 'line {} in {}: found vector of inconsistent dimension for token ' \ '"{}". expected dim: {}, found: {}'.format(line_num, pretrained_file_path, token, vec_len, len(elems)) all_elems.extend(elems) idx_to_token.append(token) tokens.add(token) idx_to_vec = nd.array(all_elems).reshape((-1, vec_len)) if unknown_token: if loaded_unknown_vec is None: idx_to_vec[unk_idx] = init_unknown_vec(shape=vec_len) else: idx_to_vec[unk_idx] = nd.array(loaded_unknown_vec) return idx_to_token, idx_to_vec, unknown_token
    @classmethod
    def from_w2v_binary(cls, pretrained_file_path, encoding=ENCODING):
        """Load embedding vectors from a binary pre-trained token embedding file.

        Parameters
        ----------
        pretrained_file_path: str
            The path to a binary pre-trained token embedding file ending with
            .bin as its file extension.
        encoding: str
            The encoding type of the file.
        """
        return cls(source=pretrained_file_path, encoding=encoding)
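# Illustrative usage sketch for Word2Vec (not part of the original source).
# Either a hosted source or a local word2vec binary file can be loaded; the
# local path below is hypothetical.
#
# >>> import gluonnlp as nlp
# >>> w2v = nlp.embedding.Word2Vec(source='GoogleNews-vectors-negative300')
# >>> w2v['hello'].shape                  # (300,)
# >>> w2v_local = nlp.embedding.Word2Vec.from_w2v_binary('/path/to/vectors.bin')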