Source code for gluonnlp.data.corpora.wikitext

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

# pylint: disable=
"""WikiText corpora."""

__all__ = ['WikiText2', 'WikiText103', 'WikiText2Raw', 'WikiText103Raw']

import os
import shutil
import zipfile

from mxnet.gluon.utils import _get_repo_file_url, check_sha1, download

from ... import _constants as C
from ..dataset import CorpusDataset
from ..registry import register
from ...base import get_home_dir


class _WikiText(CorpusDataset):
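    """Base class shared by the WikiText datasets.

    Downloads and extracts the dataset archive into `root` and passes the
    extracted token file to CorpusDataset. Subclasses set `self._archive_file`
    and `self._data_file` before calling this constructor.
    """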
    def __init__(self, namespace, segment, bos, eos, flatten, skip_empty, root,
                 **kwargs):
        root = os.path.expanduser(root)
        os.makedirs(root, exist_ok=True)
        self._root = root
        self._namespace = 'gluon/dataset/{}'.format(namespace)
        self._segment = segment
        super(_WikiText, self).__init__(
            self._get_data(),
            bos=bos,
            eos=eos,
            flatten=flatten,
            skip_empty=skip_empty,
            **kwargs)

    def _get_data(self):
        archive_file_name, archive_hash = self._archive_file
        data_file_name, data_hash = self._data_file[self._segment]
        root = self._root
        path = os.path.join(root, data_file_name)
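        # Download and extract the archive only if the token file is missing
        # or fails the SHA-1 check; otherwise the cached copy in root is used.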
        if not os.path.exists(path) or not check_sha1(path, data_hash):
            downloaded_file_path = download(_get_repo_file_url(self._namespace, archive_file_name),
                                            path=root,
                                            sha1_hash=archive_hash)

            with zipfile.ZipFile(downloaded_file_path, 'r') as zf:
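                # Extract each member directly into root, dropping any
                # directory prefix inside the archive.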
                for member in zf.namelist():
                    filename = os.path.basename(member)
                    if filename:
                        dest = os.path.join(root, filename)
                        with zf.open(member) as source, \
                                open(dest, 'wb') as target:
                            shutil.copyfileobj(source, target)
        return path


@register(segment=['train', 'val', 'test'])
class WikiText2(_WikiText):
    """WikiText-2 word-level dataset for language modeling, from Salesforce research.

    WikiText2 is implemented as CorpusDataset with the default flatten=True.

    From
    https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/

    License: Creative Commons Attribution-ShareAlike

    Parameters
    ----------
    segment : {'train', 'val', 'test'}, default 'train'
        Dataset segment.
    flatten : bool, default True
        Whether to return all samples as flattened tokens. If True, each sample is a token.
    skip_empty : bool, default True
        Whether to skip the empty samples produced from sample_splitters. If False, `bos` and
        `eos` will be added in empty samples.
    tokenizer : function, default str.split
        A function that splits each sample string into list of tokens.
    bos : str or None, default None
        The token to add at the beginning of each sentence. If None, nothing is added.
    eos : str or None, default '<eos>'
        The token to add at the end of each sentence. If None, nothing is added.
    root : str, default '$MXNET_HOME/datasets/wikitext-2'
        Path to temp folder for storing data.
        MXNET_HOME defaults to '~/.mxnet'.

    Examples
    --------
    >>> wikitext2 = gluonnlp.data.WikiText2('val', root='./datasets/wikitext2')
    -etc-
    >>> len(wikitext2)
    216347
    >>> wikitext2[0]
    '='
    >>> wikitext2 = gluonnlp.data.WikiText2('val', flatten=False,
    ...                                     root='./datasets/wikitext2')
    >>> len(wikitext2)
    2461
    >>> wikitext2[0]
    ['=', 'Homarus', 'gammarus', '=', '<eos>']
    >>> wikitext2 = gluonnlp.data.WikiText2('val', flatten=False, bos='<bos>', eos=None,
    ...                                     root='./datasets/wikitext2')
    >>> wikitext2[0]
    ['<bos>', '=', 'Homarus', 'gammarus', '=']
    >>> wikitext2 = gluonnlp.data.WikiText2('val', flatten=False, bos='<bos>', eos=None,
    ...                                     skip_empty=False, root='./datasets/wikitext2')
    >>> len(wikitext2)
    3760
    >>> wikitext2[0]
    ['<bos>']
    """

    def __init__(self, segment='train', flatten=True, skip_empty=True,
                 tokenizer=lambda s: s.split(), bos=None, eos=C.EOS_TOKEN,
                 root=os.path.join(get_home_dir(), 'datasets', 'wikitext-2'),
                 **kwargs):
        self._archive_file = ('wikitext-2-v1.zip',
                              '3c914d17d80b1459be871a5039ac23e752a53cbe')
        self._data_file = {
            'train': ('wiki.train.tokens',
                      '863f29c46ef9d167fff4940ec821195882fe29d1'),
            'val': ('wiki.valid.tokens',
                    '0418625c8b4da6e4b5c7a0b9e78d4ae8f7ee5422'),
            'test': ('wiki.test.tokens',
                     'c7b8ce0aa086fb34dab808c5c49224211eb2b172')
        }
        super(WikiText2, self).__init__(
            'wikitext-2', segment=segment, bos=bos, eos=eos, flatten=flatten,
            skip_empty=skip_empty, root=root, tokenizer=tokenizer, **kwargs)


@register(segment=['train', 'val', 'test'])
class WikiText103(_WikiText):
    """WikiText-103 word-level dataset for language modeling, from Salesforce research.

    WikiText103 is implemented as CorpusDataset with the default flatten=True.

    From
    https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/

    License: Creative Commons Attribution-ShareAlike

    Parameters
    ----------
    segment : {'train', 'val', 'test'}, default 'train'
        Dataset segment.
    flatten : bool, default True
        Whether to return all samples as flattened tokens. If True, each sample is a token.
    skip_empty : bool, default True
        Whether to skip the empty samples produced from sample_splitters. If False, `bos` and
        `eos` will be added in empty samples.
    tokenizer : function, default str.split
        A function that splits each sample string into list of tokens.
    bos : str or None, default None
        The token to add at the beginning of each sentence. If None, nothing is added.
    eos : str or None, default '<eos>'
        The token to add at the end of each sentence. If None, nothing is added.
    root : str, default '$MXNET_HOME/datasets/wikitext-103'
        Path to temp folder for storing data.
        MXNET_HOME defaults to '~/.mxnet'.

    Examples
    --------
    >>> wikitext103 = gluonnlp.data.WikiText103('val', root='./datasets/wikitext103')
    -etc-
    >>> len(wikitext103)
    216347
    >>> wikitext103[0]
    '='
    >>> wikitext103 = gluonnlp.data.WikiText103('val', flatten=False,
    ...                                         root='./datasets/wikitext103')
    >>> len(wikitext103)
    2461
    >>> wikitext103[0]
    ['=', 'Homarus', 'gammarus', '=', '<eos>']
    >>> wikitext103 = gluonnlp.data.WikiText103('val', flatten=False, bos='<bos>', eos=None,
    ...                                         root='./datasets/wikitext103')
    >>> wikitext103[0]
    ['<bos>', '=', 'Homarus', 'gammarus', '=']
    >>> wikitext103 = gluonnlp.data.WikiText103('val', flatten=False, bos='<bos>', eos=None,
    ...                                         skip_empty=False, root='./datasets/wikitext103')
    >>> len(wikitext103)
    3760
    >>> wikitext103[0]
    ['<bos>']
    """

    def __init__(self, segment='train', flatten=True, skip_empty=True,
                 tokenizer=lambda s: s.split(), bos=None, eos=C.EOS_TOKEN,
                 root=os.path.join(get_home_dir(), 'datasets', 'wikitext-103'),
                 **kwargs):
        self._archive_file = ('wikitext-103-v1.zip',
                              '0aec09a7537b58d4bb65362fee27650eeaba625a')
        self._data_file = {
            'train': ('wiki.train.tokens',
                      'b7497e2dfe77e72cfef5e3dbc61b7b53712ac211'),
            'val': ('wiki.valid.tokens',
                    'c326ac59dc587676d58c422eb8a03e119582f92b'),
            'test': ('wiki.test.tokens',
                     '8a5befc548865cec54ed4273cf87dbbad60d1e47')
        }
        super(WikiText103, self).__init__(
            'wikitext-103', segment=segment, bos=bos, eos=eos, flatten=flatten,
            skip_empty=skip_empty, root=root, tokenizer=tokenizer, **kwargs)


@register(segment=['train', 'val', 'test'])
class WikiText2Raw(_WikiText):
    """WikiText-2 character-level dataset for language modeling

    WikiText2Raw is implemented as CorpusDataset with the default flatten=True.

    From Salesforce research:
    https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/

    License: Creative Commons Attribution-ShareAlike

    Parameters
    ----------
    segment : {'train', 'val', 'test'}, default 'train'
        Dataset segment.
    flatten : bool, default True
        Whether to return all samples as flattened tokens. If True, each sample is a token.
    skip_empty : bool, default True
        Whether to skip the empty samples produced from sample_splitters. If False, `bos` and
        `eos` will be added in empty samples.
    tokenizer : function, default s.encode('utf-8')
        A function that splits each sample string into list of tokens.
        The tokenizer can also be used to convert everything to lowercase.
        E.g. with tokenizer=lambda s: s.lower().encode('utf-8')
    bos : str or None, default None
        The token to add at the beginning of each sentence. If None, nothing is added.
    eos : str or None, default None
        The token to add at the end of each sentence. If None, nothing is added.
    root : str, default '$MXNET_HOME/datasets/wikitext-2'
        Path to temp folder for storing data.
        MXNET_HOME defaults to '~/.mxnet'.

    Examples
    --------
    >>> wikitext2 = gluonnlp.data.WikiText2Raw('val', root='./datasets/wikitext2')
    -etc-
    >>> len(wikitext2)
    1136862
    >>> wikitext2[0]
    61
    >>> type(wikitext2[0])
    <class 'int'>
    >>> wikitext2 = gluonnlp.data.WikiText2Raw('val', flatten=False,
    ...                                        tokenizer=None, root='./datasets/wikitext2')
    >>> len(wikitext2)
    2461
    >>> wikitext2[0]
    '= Homarus gammarus ='
    >>> wikitext2 = gluonnlp.data.WikiText2Raw('val', flatten=False, bos='<bos>', eos=None,
    ...                                        tokenizer=lambda s: s.split(),
    ...                                        root='./datasets/wikitext2')
    >>> wikitext2[0]
    ['<bos>', '=', 'Homarus', 'gammarus', '=']
    """

    def __init__(self, segment='train', flatten=True, skip_empty=True,
                 bos=None, eos=None, tokenizer=lambda s: s.encode('utf-8'),
                 root=os.path.join(get_home_dir(), 'datasets', 'wikitext-2'),
                 **kwargs):
        self._archive_file = ('wikitext-2-raw-v1.zip',
                              '3b6993c138fc61c95f7fffd900fef68f8411371d')
        self._data_file = {
            'train': ('wiki.train.raw',
                      'd33faf256327882db0edc7c67cd098d1051a2112'),
            'val': ('wiki.valid.raw',
                    'db78d4db83700cba1b1bf4a9381087043db2876d'),
            'test': ('wiki.test.raw',
                     '6f1fe2054a940eebfc76b284b09680763b37f5ea')
        }
        super(WikiText2Raw, self).__init__(
            'wikitext-2', segment=segment, bos=bos, eos=eos, flatten=flatten,
            skip_empty=skip_empty, root=root, tokenizer=tokenizer, **kwargs)


@register(segment=['train', 'val', 'test'])
class WikiText103Raw(_WikiText):
    """WikiText-103 character-level dataset for language modeling

    WikiText103Raw is implemented as CorpusDataset with the default flatten=True.

    From Salesforce research:
    https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/

    License: Creative Commons Attribution-ShareAlike

    Parameters
    ----------
    segment : {'train', 'val', 'test'}, default 'train'
        Dataset segment.
    flatten : bool, default True
        Whether to return all samples as flattened tokens. If True, each sample is a token.
    skip_empty : bool, default True
        Whether to skip the empty samples produced from sample_splitters. If False, `bos` and
        `eos` will be added in empty samples.
    tokenizer : function, default s.encode('utf-8')
        A function that splits each sample string into list of tokens.
        The tokenizer can also be used to convert everything to lowercase.
        E.g. with tokenizer=lambda s: s.lower().encode('utf-8')
    bos : str or None, default None
        The token to add at the beginning of each sentence. If None, nothing is added.
    eos : str or None, default None
        The token to add at the end of each sentence. If None, nothing is added.
    root : str, default '$MXNET_HOME/datasets/wikitext-103'
        Path to temp folder for storing data.
        MXNET_HOME defaults to '~/.mxnet'.

    Examples
    --------
    >>> wikitext103 = gluonnlp.data.WikiText103Raw('val', root='./datasets/wikitext103')
    -etc-
    >>> len(wikitext103)
    1136862
    >>> wikitext103[0]
    61
    >>> wikitext103 = gluonnlp.data.WikiText103Raw('val', flatten=False,
    ...                                            root='./datasets/wikitext103')
    >>> len(wikitext103)
    2461
    >>> wikitext103[0]
    [61, 32, 72, 111, 109, 97, 114, 117, 115, 32, 103, 97, 109, 109, 97, 114, 117, 115, 32, 61]
    >>> wikitext103 = gluonnlp.data.WikiText103Raw('val', flatten=False, tokenizer=None,
    ...                                            root='./datasets/wikitext103')
    >>> wikitext103[0]
    '= Homarus gammarus ='
    """

    def __init__(self, segment='train', flatten=True, skip_empty=True,
                 tokenizer=lambda s: s.encode('utf-8'), bos=None, eos=None,
                 root=os.path.join(get_home_dir(), 'datasets', 'wikitext-103'),
                 **kwargs):
        self._archive_file = ('wikitext-103-raw-v1.zip',
                              '86f2375181b9247049d9c9205fad2b71b274b568')
        self._data_file = {
            'train': ('wiki.train.raw',
                      '3d06627c15e834408cfee91293f862c11c1cc9ef'),
            'val': ('wiki.valid.raw',
                    'db78d4db83700cba1b1bf4a9381087043db2876d'),
            'test': ('wiki.test.raw',
                     '6f1fe2054a940eebfc76b284b09680763b37f5ea')
        }
        super(WikiText103Raw, self).__init__(
            'wikitext-103', segment=segment, bos=bos, eos=eos, flatten=flatten,
            skip_empty=skip_empty, root=root, tokenizer=tokenizer, **kwargs)
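

# A minimal usage sketch, not part of the module above: load the flattened
# WikiText-2 validation split and build a vocabulary from its token counts.
# It assumes the public gluonnlp.Vocab and gluonnlp.data.count_tokens APIs.
if __name__ == '__main__':
    import gluonnlp

    val = gluonnlp.data.WikiText2(segment='val', flatten=True, eos='<eos>')
    counter = gluonnlp.data.count_tokens(list(val))  # frequency of every token
    vocab = gluonnlp.Vocab(counter)                  # index built from the counter
    print(len(val), len(vocab))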