Source code for gluonnlp.data.conll

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

# pylint: disable=
"""CoNLL format corpora."""

__all__ = ['CoNLL2000', 'CoNLL2001', 'CoNLL2002', 'CoNLL2004', 'UniversalDependencies21']

import codecs
import glob
import gzip
import io
import os
import shutil
import tarfile

from mxnet.gluon.data import SimpleDataset
from mxnet.gluon.utils import download, check_sha1, _get_repo_file_url

from .. import _constants as C
from .registry import register
from ..base import get_home_dir


class _CoNLLSequenceTagging(SimpleDataset):
    def __init__(self, segment, root, has_comment=False):
        root = os.path.expanduser(root)
        os.makedirs(root, exist_ok=True)
        self._segment = segment
        self._root = root
        self._has_comment = has_comment
        super(_CoNLLSequenceTagging, self).__init__(self._read_data())

    def _get_data_file_hash(self):
        assert self._segment in self._data_file, \
                'Segment "{}" is not available. Options are: {}.'.format(self._segment,
                                                                         self._data_files.keys())
        return [self._data_file[self._segment]]

    def _get_data_archive_hash(self):
        return self._get_data_file_hash()[0]

    def _extract_archive(self):
        pass

    def _get_data(self):
        archive_file_name, archive_hash = self._get_data_archive_hash()
        paths = []
        for data_file_name, data_hash in self._get_data_file_hash():
            root = self._root
            path = os.path.join(root, data_file_name)
            if hasattr(self, 'namespace'):
                url = _get_repo_file_url(self.namespace, archive_file_name)
            else:
                url = self.base_url + archive_file_name
            if not os.path.exists(path) or not check_sha1(path, data_hash):
                download(url, path=root, sha1_hash=archive_hash)
                self._extract_archive()
            paths.append(path)
        return paths

    def _read_data(self):
        paths = self._get_data()
        results = []
        for path in paths:
            with gzip.open(path, 'r') if path.endswith('gz') else io.open(path, 'rb') as f:
                line_iter = codecs.getreader(self.codec)\
                    (io.BufferedReader(f))  # pytype: disable=wrong-arg-types
                results.append(self._process_iter(line_iter))
        return list([x for field in item for x in field] for item in zip(*results))

    def _process_iter(self, line_iter):
        samples = []
        buf = []
        for line in line_iter:
            if not buf and line.startswith('#') and self._has_comment:
                continue
            line = line.split()
            if line:
                buf.append(line)
            elif buf:
                samples.append(tuple(map(list, zip(*buf))))
                buf = []
        if buf:
            samples.append(tuple(map(list, zip(*buf))))
        return samples


[docs]@register(segment=['train', 'test'])
class CoNLL2000(_CoNLLSequenceTagging):
    """CoNLL2000 Part-of-speech (POS) tagging and chunking joint task dataset.

    Each sample has three fields: word, POS tag, chunk label.

    From
    https://www.clips.uantwerpen.be/conll2000/chunking/

    Parameters
    ----------
    segment : {'train', 'test'}, default 'train'
        Dataset segment.
    root : str, default '$MXNET_HOME/datasets/conll2000'
        Path to temp folder for storing data.
        MXNET_HOME defaults to '~/.mxnet'.

    Examples
    --------
    >>> conll = gluonnlp.data.CoNLL2000('test', root='./datasets/conll2000')
    -etc-
    >>> len(conll)
    2012
    >>> len(conll[0])
    3
    >>> conll[8][0]
    ['SHEARSON', 'LEHMAN', 'HUTTON', 'Inc', '.']
    >>> conll[8][1]
    ['NNP', 'NNP', 'NNP', 'NNP', '.']
    >>> conll[8][2]
    ['B-NP', 'I-NP', 'I-NP', 'I-NP', 'O']
    """
    def __init__(self, segment='train',
                 root=os.path.join(get_home_dir(), 'datasets', 'conll2000')):
        self._data_file = {'train': ('train.txt.gz',
                                     '9f31cf936554cebf558d07cce923dca0b7f31864'),
                           'test': ('test.txt.gz',
                                    'dc57527f1f60eeafad03da51235185141152f849')}
        super(CoNLL2000, self).__init__(segment, root)

    base_url = 'https://www.clips.uantwerpen.be/conll2000/chunking/'
    codec = 'utf-8'


[docs]@register(segment=['train', 'testa', 'testb'], part=[1, 2, 3])
class CoNLL2001(_CoNLLSequenceTagging):
    """CoNLL2001 Clause Identification dataset.

    Each sample has four fields: word, POS tag, chunk label, clause tag.

    From
    https://www.clips.uantwerpen.be/conll2001/clauses/

    Parameters
    ----------
    part : int, {1, 2, 3}
        Part number of the dataset.
    segment : {'train', 'testa', 'testb'}, default 'train'
        Dataset segment.
    root : str, default '$MXNET_HOME/datasets/conll2001'
        Path to temp folder for storing data.
        MXNET_HOME defaults to '~/.mxnet'.

    Examples
    --------
    >>> conll = gluonnlp.data.CoNLL2001(1, 'testa', root='./datasets/conll2001')
    -etc-
    >>> len(conll)
    2012
    >>> len(conll[0])
    4
    >>> conll[8][0]
    ['SHEARSON', 'LEHMAN', 'HUTTON', 'Inc', '.']
    >>> conll[8][1]
    ['NNP', 'NNP', 'NNP', 'NNP', '.']
    >>> conll[8][2]
    ['B-NP', 'I-NP', 'I-NP', 'I-NP', 'O']
    >>> conll[8][3]
    ['X', 'X', 'X', 'X', 'X']
    """
    def __init__(self, part, segment='train',
                 root=os.path.join(get_home_dir(), 'datasets', 'conll2001')):
        self._part = part
        self._data_file = [
            {'train': ('train1',
                       '115400d32437a86af85fbd549c1297775aec5996'),
             'testa': ('testa1',
                       '0fad761a9c3e0fece80550add3420554619bce66'),
             'testb': ('testb1',
                       'f1075e69b57a9c8e5e5de8496610469dcaaca511')},
            {'train': ('train2',
                       'd48cf110875e5999e20e72bc446c9dd19fdde618'),
             'testa': ('testa2',
                       '27262d3a45e6b08631d8c2c8d8c33cf7fd63db2c'),
             'testb': ('testb2',
                       'd8d0b5819ca5e275c25cec0287ffff8811e65321')},
            {'train': ('train3',
                       'c064ba4cb54f81a3d1e15d48cc990dee55a326bc'),
             'testa': ('testa3',
                       'c0c11cceb17bba8e0aaad0368d8b0b869c4959f7'),
             'testb': ('testb3',
                       'a37f3ca89eb4db08fc576f50161f6c2945302541')}
            ]
        super(CoNLL2001, self).__init__(segment, root)

    base_url = 'https://www.clips.uantwerpen.be/conll2001/clauses/data/'
    codec = 'utf-8'

    def _get_data_file_hash(self):
        assert self._part in [1, 2, 3], \
                'Part "{}" is not availble. Options are 1, 2, 3.'.format(self._part)
        available_segments = self._data_file[self._part-1].keys()
        assert self._segment in available_segments, \
                'Segment "{}" is not available. Options are: {}.'.format(self._segment,
                                                                         available_segments)
        return [self._data_file[self._part-1][self._segment]]


[docs]@register(segment=['train', 'testa', 'testb'], lang=['esp', 'ned'])
class CoNLL2002(_CoNLLSequenceTagging):
    """CoNLL2002 Named Entity Recognition (NER) task dataset.

    For 'esp', each sample has two fields: word, NER label.

    For 'ned', each sample has three fields: word, POS tag, NER label.

    From
    https://www.clips.uantwerpen.be/conll2002/ner/

    Parameters
    ----------
    lang : str, {'esp', 'ned'}
        Dataset language.
    segment : {'train', 'testa', 'testb'}, default 'train'
        Dataset segment.
    root : str, default '$MXNET_HOME/datasets/conll2002'
        Path to temp folder for storing data.
        MXNET_HOME defaults to '~/.mxnet'.

    Examples
    --------
    >>> conll = gluonnlp.data.CoNLL2002('esp', 'testa', root='./datasets/conll2002')
    -etc-
    >>> len(conll)
    1915
    >>> len(conll[0])
    2
    >>> conll[0][0]
    ['Sao', 'Paulo', '(', 'Brasil', ')', ',', '23', 'may', '(', 'EFECOM', ')', '.']
    >>> conll[0][1]
    ['B-LOC', 'I-LOC', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O']
    """
    def __init__(self, lang, segment='train',
                 root=os.path.join(get_home_dir(), 'datasets', 'conll2002')):
        self._lang = lang
        self._data_file = {
            'esp': {'train': ('esp.train.gz',
                              '2f25c8c1a724009f440af8bb3c03710f089dfe11'),
                    'testa': ('esp.testa.gz',
                              '1afd035a29419b1a9531308cae6157c624260693'),
                    'testb': ('esp.testb.gz',
                              'c6a16bcb0399bf212fec80d6049eaeffcdb58c1d')},
            'ned': {'train': ('ned.train.gz',
                              '4282015737b588efa13e6616222d238247a85c67'),
                    'testa': ('ned.testa.gz',
                              '7584cbf55692d3b0c133de6d7411ad04ae0e710a'),
                    'testb': ('ned.testb.gz',
                              '4d07c576f99aae8a305855a9cbf40163c0b8d84e')}}
        super(CoNLL2002, self).__init__(segment, root)

    base_url = 'https://www.clips.uantwerpen.be/conll2002/ner/data/'
    codec = 'latin-1'

    def _get_data_file_hash(self):
        assert self._lang in self._data_file, \
                'Language "{}" is not available. Options are "{}".'.format(self._lang,
                                                                           self._data_file.keys())
        available_segments = self._data_file[self._lang].keys()
        assert self._segment in available_segments, \
                'Segment "{}" is not available. Options are: {}.'.format(self._segment,
                                                                         available_segments)
        return [self._data_file[self._lang][self._segment]]


[docs]@register(segment=['train', 'dev', 'test'])
class CoNLL2004(_CoNLLSequenceTagging):
    """CoNLL2004 Semantic Role Labeling (SRL) task dataset.

    Each sample has six or more fields: word, POS tag, chunk label, clause tag, NER label,
    target verbs, and sense labels (of variable number per sample).

    From
    http://www.cs.upc.edu/~srlconll/st04/st04.html

    Parameters
    ----------
    segment : {'train', 'dev', 'test'}, default 'train'
        Dataset segment.
    root : str, default '$MXNET_HOME/datasets/conll2004'
        Path to temp folder for storing data.
        MXNET_HOME defaults to '~/.mxnet'.

    Examples
    --------
    >>> conll = gluonnlp.data.CoNLL2004('dev', root='./datasets/conll2004')
    -etc-
    >>> len(conll)
    2012
    >>> len(conll[8])
    6
    >>> conll[8][0]
    ['SHEARSON', 'LEHMAN', 'HUTTON', 'Inc', '.']
    >>> conll[8][1]
    ['NNP', 'NNP', 'NNP', 'NNP', '.']
    >>> conll[8][2]
    ['B-NP', 'I-NP', 'I-NP', 'I-NP', 'O']
    >>> conll[8][3]
    ['*', '*', '*', '*', '*']
    >>> conll[8][4]
    ['B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'O']
    >>> conll[8][5]
    ['-', '-', '-', '-', '-']
    """
    def __init__(self, segment='train',
                 root=os.path.join(get_home_dir(), 'datasets', 'conll2004')):
        self._archive_file = ('conll04st-release.tar.gz',
                              '09ef957d908d34fa0abd745cbe43e279414f076c')
        self._data_file = {
            'word': {'train': ('words.train.gz',
                               '89ac63dcdcffc71601a224be6ada7f2e67c8e61f'),
                     'dev': ('words.dev.gz',
                             'c3e59d75ae6bbeb76ee78e52a7a7c6b52abc5b6f'),
                     'test': ('words.test.gz',
                              '61c7653732d83b51593ed29ae7ff45cd8277c8b5')},
            'synt': {'train': ('synt.train.pred.gz',
                               '43ed796f953dcf00db52ec593ed3377aa440d838'),
                     'dev': ('synt.dev.pred.gz',
                             'c098ca8a265fb67529c90eee5a93f6781ad87747'),
                     'test': ('synt.test.pred.gz',
                              '272c2856171f3e28e3512906ee07019bac90a6b2')},
            'ne': {'train': ('ne.train.pred.gz',
                             'd10e8b11b6b856efac978697af75cf582cac6e86'),
                   'dev': ('ne.dev.pred.gz',
                           '7883f76f28675d2a7247be527967b846494bbe2c'),
                   'test': ('ne.test.pred.gz',
                            'f1a52a58bb96e07e0288479a4a633476d8211963')},
            'props': {'train': ('props.train.gz',
                                'c67bb4546e9110ce39ce063624c7a0adf65ea795'),
                      'dev': ('props.dev.gz',
                              '7e232a4113d1a7e68b719a2781f09399ebf39956'),
                      'test': ('props.test.gz',
                               '639d54e24cebd7476b05c0efc0cbb019ebe52d8e')}}

        super(CoNLL2004, self).__init__(segment, root)

    base_url = 'http://www.cs.upc.edu/~srlconll/st04/'
    namespace = 'gluon/dataset/conll'
    codec = 'utf-8'

    def _get_data_file_hash(self):
        available_segments = self._data_file['ne'].keys()
        assert self._segment in self._data_file['ne'], \
                'Segment "{}" is not available. Options are: {}'.format(self._segment,
                                                                        available_segments)
        return [self._data_file[part][self._segment] for part in ['word', 'synt', 'ne', 'props']]

    def _get_data_archive_hash(self):
        return self._archive_file

    def _extract_archive(self):
        archive_file_name, _ = self._get_data_archive_hash()
        root = self._root
        path = os.path.join(root, archive_file_name)
        with tarfile.open(path, 'r:gz') as tar:
            tar.extractall(path=root)
        for fn in glob.glob(os.path.join(root, 'conll04st-release', '*.gz')):
            shutil.copy(fn, root)
        shutil.rmtree(os.path.join(root, 'conll04st-release'), ignore_errors=True)


[docs]@register(segment=['train', 'dev', 'test'],
          lang=list(C.UD21_DATA_FILE_SHA1.keys()))
class UniversalDependencies21(_CoNLLSequenceTagging):
    """Universal dependencies tree banks.

    Each sample has 8 or more fields as described in
    http://universaldependencies.org/docs/format.html

    From
    https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-2515

    Parameters
    ----------
    lang : str, default 'en'
        Dataset language.
    segment : str, default 'train'
        Dataset segment.
    root : str, default '$MXNET_HOME/datasets/ud2.1'
        Path to temp folder for storing data.
        MXNET_HOME defaults to '~/.mxnet'.

    Examples
    --------
    >>> ud = gluonnlp.data.UniversalDependencies21('en', 'dev', root='./datasets/ud21')
    -etc-
    >>> len(ud)
    2002
    >>> len(ud[0])
    10
    >>> ud[0][0]
    ['1', '2', '3', '4', '5', '6', '7']
    >>> ud[0][1]
    ['From', 'the', 'AP', 'comes', 'this', 'story', ':']
    >>> ud[0][2]
    ['from', 'the', 'AP', 'come', 'this', 'story', ':']
    >>> ud[0][3]
    ['ADP', 'DET', 'PROPN', 'VERB', 'DET', 'NOUN', 'PUNCT']
    >>> ud[0][4]
    ['IN', 'DT', 'NNP', 'VBZ', 'DT', 'NN', ':']
    >>> ud[0][5][:3]
    ['_', 'Definite=Def|PronType=Art', 'Number=Sing']
    >>> ud[0][6]
    ['3', '3', '4', '0', '6', '4', '4']
    >>> ud[0][7]
    ['case', 'det', 'obl', 'root', 'det', 'nsubj', 'punct']
    >>> ud[0][8]
    ['3:case', '3:det', '4:obl', '0:root', '6:det', '4:nsubj', '4:punct']
    >>> ud[0][9]
    ['_', '_', '_', '_', '_', '_', '_']
    """
    def __init__(self, lang='en', segment='train',
                 root=os.path.join(get_home_dir(), 'datasets', 'ud2.1')):
        self._archive_file = ('ud-treebanks-v2.1.tgz',
                              '77657b897951e21d2eca6b29958e663964eb57ae')
        self._lang = lang
        self._data_file = C.UD21_DATA_FILE_SHA1

        super(UniversalDependencies21, self).__init__(segment, root, True)

    base_url = 'https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-2515/'
    codec = 'utf-8'

    def _get_data_file_hash(self):
        assert self._lang in self._data_file, \
                'Language "{}" is not available. Options are {}.'.format(
                    self._lang, list(self._data_file.keys()))
        available_segments = self._data_file[self._lang].keys()
        assert self._segment in available_segments, \
                'Segment "{}" is not available for language "{}". ' \
                'Options are: {}.'.format(self._segment, self._lang, list(available_segments))
        return [self._data_file[self._lang][self._segment]]

    def _get_data_archive_hash(self):
        return self._archive_file

    def _extract_archive(self):
        archive_file_name, _ = self._get_data_archive_hash()
        root = self._root
        path = os.path.join(root, archive_file_name)
        with tarfile.open(path, 'r:gz') as tar:
            tar.extractall(path=root)
        for fn in glob.glob(os.path.join(root, 'ud-treebanks-v2.1', '*', '*.conllu')):
            shutil.copy(fn, root)
        for data_license in glob.glob(os.path.join(root, 'ud-treebanks-v2.1', '*', 'LICENSE.txt')):
            lang = os.path.dirname(data_license).split(os.path.sep)[-1]
            shutil.copy(data_license, os.path.join(root, '{}_LICENSE.txt'.format(lang)))
        shutil.rmtree(os.path.join(root, 'ud-treebanks-v2.1'), ignore_errors=True)