Source code for gluonnlp.data.word_embedding_evaluation

# coding: utf-8

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

# pylint: disable=too-many-lines
"""Word embedding evaluation datasets."""

import os
import tarfile
import zipfile

from mxnet.gluon.data.dataset import SimpleDataset
from mxnet.gluon.utils import check_sha1, _get_repo_file_url, download

from .. import _constants as C
from .dataset import CorpusDataset
from .registry import register
from ..base import get_home_dir

# Abstract base classes that the concrete datasets in this module derive from.
base_datasets = [
    'WordSimilarityEvaluationDataset',
    'WordAnalogyEvaluationDataset',
]
# Datasets whose samples have the form ['word1', 'word2', score].
word_similarity_datasets = [
    'WordSim353',
    'MEN',
    'RadinskyMTurk',
    'RareWords',
    'SimLex999',
    'SimVerb3500',
    'SemEval17Task2',
    'BakerVerb143',
    'YangPowersVerb130',
]
# Datasets whose samples are two analogical word pairs.
word_analogy_datasets = [
    'GoogleAnalogyTestSet',
    'BiggerAnalogyTestSet',
]
# Public API: the base classes followed by every concrete dataset.
__all__ = base_datasets + word_similarity_datasets + word_analogy_datasets


class _Dataset(SimpleDataset):
    """Shared download-and-cache logic for the evaluation datasets.

    Subclasses configure the class attributes below and implement
    ``_get_data``; whatever ``_get_data`` returns is handed to the
    ``SimpleDataset`` constructor.

    Parameters
    ----------
    root : str
        Directory in which the dataset files are stored. A leading ``~`` is
        expanded and the directory is created if it does not exist yet.
    """
    _url = None  # Dataset is retrieved from here if not cached
    _archive_file = (None, None)  # Archive name and checksum
    _checksums = None  # Maps relative file path -> checksum of archive contents
    _verify_ssl = True  # Verify SSL certificates when downloading from self._url
    _namespace = None  # Contains S3 namespace for self-hosted datasets

    def __init__(self, root):
        self.root = os.path.expanduser(root)
        if not os.path.isdir(self.root):
            os.makedirs(self.root)
        # Files must be on disk before _get_data() tries to read them.
        self._download_data()
        super(_Dataset, self).__init__(self._get_data())

    def _download_data(self):
        """Fetch (and unpack) the archive for every missing or corrupt file.

        Raises
        ------
        RuntimeError
            If more than one file is expected but the downloaded file is
            neither a ``zip`` nor a ``tar.gz`` archive.
        """
        archive_name, archive_sha1 = self._archive_file
        for relative_name, expected_sha1 in self._checksums.items():
            # Checksum keys always use '/' as separator; rebuild the path
            # with the separator of the local platform.
            target = os.path.join(self.root, *relative_name.split('/'))
            if os.path.exists(target) and check_sha1(target, expected_sha1):
                continue  # Already present with the expected content

            if self._namespace is None:
                source_url = self._url
            else:
                source_url = _get_repo_file_url(self._namespace, archive_name)
            fetched = download(source_url, path=self.root,
                               sha1_hash=archive_sha1,
                               verify_ssl=self._verify_ssl)

            lowered = fetched.lower()
            if lowered.endswith('zip'):
                with zipfile.ZipFile(fetched, 'r') as archive:
                    archive.extractall(path=self.root)
            elif lowered.endswith('tar.gz'):
                with tarfile.open(fetched, 'r') as archive:
                    archive.extractall(path=self.root)
            elif len(self._checksums) > 1:
                # A single expected file may be the plain download itself,
                # but several files can only come out of an archive.
                message = ('Failed retrieving {clsname}.'
                           ' Expecting multiple files, '
                           'but could not detect archive format.')
                raise RuntimeError(
                    message.format(clsname=self.__class__.__name__))

    def _get_data(self):
        """Return the samples of the dataset; implemented by subclasses."""
        raise NotImplementedError


###############################################################################
# Word similarity and relatedness datasets
###############################################################################
class WordSimilarityEvaluationDataset(_Dataset):
    """Base class for word similarity or relatedness task datasets.

    Inheriting classes are assumed to implement datasets of the form
    ['word1', 'word2', score] where score is a numerical similarity or
    relatedness score with respect to 'word1' and 'word2'.

    Parameters
    ----------
    root : str
        Directory in which the dataset files are stored.
    """

    def __init__(self, root):
        super(WordSimilarityEvaluationDataset, self).__init__(root=root)
        # Subclasses return the score as read from disk; normalise it here.
        self._cast_score_to_float()

    def _get_data(self):
        raise NotImplementedError

    def _cast_score_to_float(self):
        """Rebuild every sample as ``[word1, word2, float(score)]``."""
        converted = []
        for row in self._data:
            converted.append([row[0], row[1], float(row[2])])
        self._data = converted
@register(segment=['all', 'similarity', 'relatedness'])
class WordSim353(WordSimilarityEvaluationDataset):
    """WordSim353 dataset.

    The dataset was collected by Finkelstein et al.
    (http://www.cs.technion.ac.il/~gabr/resources/data/wordsim353/). Agirre et
    al. proposed to split the collection into two datasets, one focused on
    measuring similarity, and the other one on relatedness
    (http://alfonseca.org/eng/research/wordsim353.html).

    - Finkelstein, L., Gabrilovich, E., Matias, Y., Rivlin, E., Solan, Z.,
      Wolfman, G., & Ruppin, E. (2002). Placing search in context: the concept
      revisited. ACM Trans. Inf. Syst., 20(1), 116–131.
      https://dl.acm.org/citation.cfm?id=372094
    - Agirre, E., Alfonseca, E., Hall, K. B., Kravalova, J., Pasca, M., & Soroa,
      A. (2009). A study on similarity and relatedness using distributional and
      wordnet-based approaches. In , Human Language Technologies: Conference of
      the North American Chapter of the Association of Computational
      Linguistics, Proceedings, May 31 - June 5, 2009, Boulder, Colorado, USA
      (pp. 19–27). : The Association for Computational Linguistics.

    License: Creative Commons Attribution 4.0 International (CC BY 4.0)

    Each sample consists of a pair of words, and a score with scale from
    0 (totally unrelated words) to 10 (very much related or identical words).

    Parameters
    ----------
    segment : str, default 'all'
        'relatedness', 'similarity' or 'all'
    root : str, default '$MXNET_HOME/datasets/wordsim353'
        Path to temp folder for storing data.
        MXNET_HOME defaults to '~/.mxnet'.

    Examples
    --------
    >>> wordsim353 = gluonnlp.data.WordSim353('similarity', root='./datasets/wordsim353')
    -etc-
    >>> len(wordsim353)
    203
    >>> wordsim353[0]
    ['Arafat', 'Jackson', 2.5]
    """
    _url = 'http://alfonseca.org/pubs/ws353simrel.tar.gz'
    _namespace = 'gluon/dataset/ws353'
    _archive_file = ('ws353simrel.tar.gz',
                     '1b9ca7f4d61682dea0004acbd48ce74275d5bfff')
    _checksums = {
        'wordsim353_sim_rel/wordsim353_agreed.txt':
        '1c9f77c9dd42bcc09092bd32adf0a1988d03ca80',
        'wordsim353_sim_rel/wordsim353_annotator1.txt':
        '674d5a9263d099a5128b4bf4beeaaceb80f71f4e',
        'wordsim353_sim_rel/wordsim353_annotator2.txt':
        '9b79a91861a4f1075183b93b89b73e1b470b94c1',
        'wordsim353_sim_rel/wordsim_relatedness_goldstandard.txt':
        'c36c5dc5ebea9964f4f43e2c294cd620471ab1b8',
        'wordsim353_sim_rel/wordsim_similarity_goldstandard.txt':
        '4845df518a83c8f7c527439590ed7e4c71916a99'
    }

    _data_file = {
        'relatedness': ('wordsim_relatedness_goldstandard.txt',
                        'c36c5dc5ebea9964f4f43e2c294cd620471ab1b8'),
        'similarity': ('wordsim_similarity_goldstandard.txt',
                       '4845df518a83c8f7c527439590ed7e4c71916a99')
    }

    min = 0
    max = 10

    def __init__(self, segment='all', root=os.path.join(
            get_home_dir(), 'datasets', 'wordsim353')):
        if segment is not None:
            assert segment in ['all', 'relatedness', 'similarity']

        self.segment = segment
        super(WordSim353, self).__init__(root=root)

    def _get_data(self):
        # Collect the gold-standard file(s) that make up the chosen segment;
        # 'all' is the union of both files.
        wanted = []
        if self.segment in ('relatedness', 'all'):
            wanted.append(
                'wordsim353_sim_rel/wordsim_relatedness_goldstandard.txt')
        if self.segment in ('similarity', 'all'):
            wanted.append(
                'wordsim353_sim_rel/wordsim_similarity_goldstandard.txt')
        paths = [os.path.join(self.root, name) for name in wanted]

        # Going through a set drops rows that occur more than once across
        # the loaded files; sorting makes the order deterministic again.
        unique_rows = {tuple(row) for row in CorpusDataset(paths)}
        return sorted(unique_rows)
@register(segment=['full', 'dev', 'test'])
class MEN(WordSimilarityEvaluationDataset):
    """MEN dataset for word-similarity and relatedness.

    The dataset was collected by Bruni et al.
    (https://staff.fnwi.uva.nl/e.bruni/MEN).

    - Bruni, E., Boleda, G., Baroni, M., & Nam-Khanh Tran (2012). Distributional
      semantics in technicolor. In , The 50th Annual Meeting of the Association
      for Computational Linguistics, Proceedings of the Conference, July 8-14,
      2012, Jeju Island, Korea - Volume 1: Long Papers (pp. 136–145). : The
      Association for Computer Linguistics.

    License: Creative Commons Attribution 2.0 Generic (CC BY 2.0)

    Each sample consists of a pair of words, and a score with scale from
    0 (totally unrelated words) to 50 (very much related or identical words).

    Parameters
    ----------
    segment : str, default 'dev'
        Dataset segment. Options are 'full', 'dev', 'test'.
    root : str, default '$MXNET_HOME/datasets/men'
        Path to temp folder for storing data.
        MXNET_HOME defaults to '~/.mxnet'.

    Examples
    --------
    >>> men = gluonnlp.data.MEN('test', root='./datasets/men')
    -etc-
    >>> len(men)
    1000
    >>> men[0]
    ['display', 'pond', 10.0]
    """
    _url = 'https://staff.fnwi.uva.nl/e.bruni/resources/MEN.tar.gz'
    _namespace = 'gluon/dataset/men'
    _archive_file = ('MEN.tar.gz', '3c4af1b7009c1ad75e03562f7f7bc5f51ff3a31a')
    _checksums = {
        'MEN/MEN_dataset_lemma_form.dev':
        '55d2c9675f84dc661861172fc89db437cab2ed92',
        'MEN/MEN_dataset_lemma_form.test':
        'c003c9fddfe0ce1d38432cdb13863599d7a2d37d',
        'MEN/MEN_dataset_lemma_form_full':
        'e32e0a0fa09ccf95aa898bd42011e84419f7fafb',
        'MEN/MEN_dataset_natural_form_full':
        'af9c2ca0033e2561676872eed98e223ee6366b82',
        'MEN/agreement/agreement-score.txt':
        'bee1fe16ce63a198a12a924ceb50253c49c7b45c',
        'MEN/agreement/elias-men-ratings.txt':
        'd180252df271de96c8fbba6693eaa16793e0f7f0',
        'MEN/agreement/marcos-men-ratings.txt':
        'dbfceb7d88208c2733861f27d3d444c15db18519',
        'MEN/instructions.txt':
        'e6f69c7338246b404bafa6e24257fc4a5ba01baa',
        'MEN/licence.txt':
        'f57c6d61814a0895236ab99c06b61b2611430f92'
    }

    # Maps each supported segment to its lemma-form file inside the archive.
    _segment_file = {
        'full': 'MEN/MEN_dataset_lemma_form_full',
        'dev': 'MEN/MEN_dataset_lemma_form.dev',
        'test': 'MEN/MEN_dataset_lemma_form.test',
    }

    min = 0
    max = 50

    def __init__(self, segment='dev', root=os.path.join(
            get_home_dir(), 'datasets', 'men')):
        # Reject unknown segments before anything is downloaded; otherwise
        # a typo only surfaces later as a bare KeyError in _get_data().
        assert segment in self._segment_file
        self.segment = segment
        super(MEN, self).__init__(root=root)

    def _get_data(self):
        datafilepath = os.path.join(
            self.root, *self._segment_file[self.segment].split('/'))
        dataset = CorpusDataset(datafilepath)

        # Remove lemma information: the last two characters of each word
        # are dropped (presumably a '-' plus a one-letter tag; confirm
        # against the data files).
        return [[row[0][:-2], row[1][:-2], row[2]] for row in dataset]
@register
class RadinskyMTurk(WordSimilarityEvaluationDataset):
    """MTurk dataset for word-similarity and relatedness by Radinsky et al..

    - Radinsky, K., Agichtein, E., Gabrilovich, E., & Markovitch, S. (2011). A
      word at a time: computing word relatedness using temporal semantic
      analysis. In S. Srinivasan, K. Ramamritham, A. Kumar, M. P. Ravindra, E.
      Bertino, & R. Kumar, Proceedings of the 20th International Conference on
      World Wide Web, {WWW} 2011, Hyderabad, India, March 28 - April 1, 2011
      (pp. 337–346). : ACM.

    License: Unspecified

    Each sample consists of a pair of words, and a score with scale from
    1 (totally unrelated words) to 5 (very much related or identical words).

    Parameters
    ----------
    root : str, default '$MXNET_HOME/datasets/radinskymturk'
        Path to temp folder for storing data.
        MXNET_HOME defaults to '~/.mxnet'.

    Examples
    --------
    >>> radinskymturk = gluonnlp.data.RadinskyMTurk(root='./datasets/radinskymturk')
    -etc-
    >>> len(radinskymturk)
    287
    >>> radinskymturk[0]
    ['episcopal', 'russia', 2.75]
    """
    _url = 'http://www.kiraradinsky.com/files/Mtruk.csv'
    # The download is a plain CSV file, so the "archive" and its single
    # content file are the same file with the same checksum.
    _archive_file = ('Mtruk.csv', '14959899c092148abba21401950d6957c787434c')
    _checksums = {'Mtruk.csv': '14959899c092148abba21401950d6957c787434c'}

    min = 1
    max = 5

    def __init__(self, root=os.path.join(get_home_dir(), 'datasets',
                                         'radinskymturk')):
        super(RadinskyMTurk, self).__init__(root=root)

    def _get_data(self):
        csv_path = os.path.join(self.root, self._archive_file[0])

        def split_on_comma(line):
            return line.split(',')

        rows = CorpusDataset(csv_path, tokenizer=split_on_comma)
        return list(rows)
@register
class RareWords(WordSimilarityEvaluationDataset):
    """Rare words dataset word-similarity and relatedness.

    - Luong, T., Socher, R., & Manning, C. D. (2013). Better word
      representations with recursive neural networks for morphology. In J.
      Hockenmaier, & S. Riedel, Proceedings of the Seventeenth Conference on
      Computational Natural Language Learning, CoNLL 2013, Sofia, Bulgaria,
      August 8-9, 2013 (pp. 104–113). : ACL.

    License: Unspecified

    Each sample consists of a pair of words, and a score with scale from
    0 (totally unrelated words) to 10 (very much related or identical words).

    Parameters
    ----------
    root : str, default '$MXNET_HOME/datasets/rarewords'
        Path to temp folder for storing data.
        MXNET_HOME defaults to '~/.mxnet'.

    Examples
    --------
    >>> rarewords = gluonnlp.data.RareWords(root='./datasets/rarewords')
    -etc-
    >>> len(rarewords)
    2034
    >>> rarewords[0]
    ['squishing', 'squirt', 5.88]
    """
    _url = 'http://www-nlp.stanford.edu/~lmthang/morphoNLM/rw.zip'
    _archive_file = ('rw.zip', 'bf9c5959a0a2d7ed8e51d91433ac5ebf366d4fb9')
    _checksums = {'rw/rw.txt': 'bafc59f099f1798b47f5bed7b0ebbb933f6b309a'}

    min = 0
    max = 10

    def __init__(self, root=os.path.join(get_home_dir(), 'datasets',
                                         'rarewords')):
        super(RareWords, self).__init__(root=root)

    def _get_data(self):
        rows = CorpusDataset(os.path.join(self.root, 'rw', 'rw.txt'))
        # Only the first three columns (word1, word2, score) are kept.
        samples = []
        for row in rows:
            samples.append([row[0], row[1], row[2]])
        return samples
@register
class SimLex999(WordSimilarityEvaluationDataset):
    """SimLex999 dataset word-similarity.

    - Hill, F., Reichart, R., & Korhonen, A. (2015). Simlex-999: evaluating
      semantic models with (genuine) similarity estimation. Computational
      Linguistics, 41(4), 665–695. https://arxiv.org/abs/1408.3456

    License: Unspecified

    Each sample consists of a pair of words, and a score with scale from
    0 (totally unrelated words) to 10 (very much related or identical words).

    Parameters
    ----------
    root : str, default '$MXNET_HOME/datasets/simlex999'
        Path to temp folder for storing data.
        MXNET_HOME defaults to '~/.mxnet'.

    Examples
    --------
    >>> simlex999 = gluonnlp.data.SimLex999(root='./datasets/simlex999')
    -etc-
    >>> len(simlex999)
    999
    >>> simlex999[0]
    ['old', 'new', 1.58]
    """
    _url = 'https://www.cl.cam.ac.uk/~fh295/SimLex-999.zip'
    _archive_file = ('SimLex-999.zip',
                     '0d3afe35b89d60acf11c28324ac7be10253fda39')
    _checksums = {
        'SimLex-999/README.txt': 'f54f4a93213b847eb93cc8952052d6b990df1bd1',
        'SimLex-999/SimLex-999.txt': '0496761e49015bc266908ea6f8e35a5ec77cb2ee'
    }

    min = 0
    max = 10

    score = 'SimLex999'

    def __init__(self, root=os.path.join(get_home_dir(), 'datasets',
                                         'simlex999')):
        super(SimLex999, self).__init__(root=root)

    def _get_data(self):
        rows = CorpusDataset(
            os.path.join(self.root, 'SimLex-999', 'SimLex-999.txt'))
        samples = []
        for index, row in enumerate(rows):
            if index == 0:
                continue  # Throw away header
            # Column 3 is used as the score; column 2 is skipped.
            samples.append([row[0], row[1], row[3]])
        return samples
@register
class SimVerb3500(WordSimilarityEvaluationDataset):
    """SimVerb3500 dataset word-similarity.

    - Hill, F., Reichart, R., & Korhonen, A. (2015). Simlex-999: evaluating
      semantic models with (genuine) similarity estimation. Computational
      Linguistics, 41(4), 665–695. https://arxiv.org/abs/1408.3456

    NOTE(review): the reference above is the SimLex-999 paper, while the data
    is downloaded from the ACL anthology attachment D16-1235; confirm the
    intended citation.

    License: Unspecified

    Each sample consists of a pair of words, and a score with scale from
    0 (totally unrelated words) to 10 (very much related or identical words).

    Parameters
    ----------
    segment : str, default 'full'
        Dataset segment. Options are 'full', 'test', 'dev'.
    root : str, default '$MXNET_HOME/datasets/simverb3500'
        Path to temp folder for storing data.
        MXNET_HOME defaults to '~/.mxnet'.

    Examples
    --------
    >>> simverb3500 = gluonnlp.data.SimVerb3500(root='./datasets/simverb3500') #doctest:+SKIP
    -etc-
    >>> len(simverb3500) #doctest:+SKIP
    3500
    >>> simverb3500[0] #doctest:+SKIP
    ['take', 'remove', 6.81]
    """
    _url = 'https://www.aclweb.org/anthology/attachments/D16-1235.Attachment.zip'
    _archive_file = ('D16-1235.Attachment.zip',
                     '7bcfff115ca3e4c909b3763a2ba35e83992f2a2f')
    _checksums = {
        'data/README.txt':
        'fc2645b30a291a7486015c3e4b51d8eb599f7c7e',
        'data/SimVerb-3000-test.txt':
        '4cddf11f0fbbb3b94958e69b0614be5d125ec607',
        'data/SimVerb-3500-ratings.txt':
        '133d45daeb0e73b9da26930741455856887ac17b',
        'data/SimVerb-3500-stats.txt':
        '79a0fd7c6e03468742d276b127d70478a6995681',
        'data/SimVerb-3500.txt':
        '0e79af04fd42f44affc93004f2a02b62f155a9ae',
        'data/SimVerb-3520-annotator-ratings.csv':
        '9ff69cec9c93a1abba7be1404fc82d7f20e6633b',
        'data/SimVerb-500-dev.txt':
        '3ae184352ca2d9f855ca7cb099a65635d184f75a'
    }

    # Maps each supported segment to its file inside the archive.
    _segment_file = {
        'full': 'data/SimVerb-3500.txt',
        'test': 'data/SimVerb-3000-test.txt',
        'dev': 'data/SimVerb-500-dev.txt'
    }

    min = 0
    max = 10

    def __init__(self, segment='full', root=os.path.join(
            get_home_dir(), 'datasets', 'simverb3500')):
        # Reject unknown segments before anything is downloaded; otherwise
        # a typo only surfaces later as a bare KeyError in _get_data().
        assert segment in self._segment_file
        self.segment = segment
        super(SimVerb3500, self).__init__(root=root)

    def _get_data(self):
        dataset = CorpusDataset(
            os.path.join(self.root,
                         *self._segment_file[self.segment].split('/')))
        # Column 3 is used as the score; column 2 is skipped.
        return [[row[0], row[1], row[3]] for row in dataset]
@register(segment=['trial', 'test'])
class SemEval17Task2(WordSimilarityEvaluationDataset):
    """SemEval17Task2 dataset for word-similarity.

    NOTE(review): the description and references below are identical to the
    WordSim353 ones and look copied; confirm the intended citation for the
    SemEval 2017 Task 2 data.

    The dataset was collected by Finkelstein et al.
    (http://www.cs.technion.ac.il/~gabr/resources/data/wordsim353/). Agirre et
    al. proposed to split the collection into two datasets, one focused on
    measuring similarity, and the other one on relatedness
    (http://alfonseca.org/eng/research/wordsim353.html).

    - Finkelstein, L., Gabrilovich, E., Matias, Y., Rivlin, E., Solan, Z.,
      Wolfman, G., & Ruppin, E. (2002). Placing search in context: the concept
      revisited. ACM Trans. Inf. Syst., 20(1), 116–131.
      https://dl.acm.org/citation.cfm?id=372094
    - Agirre, E., Alfonseca, E., Hall, K. B., Kravalova, J., Pasca, M., & Soroa,
      A. (2009). A study on similarity and relatedness using distributional and
      wordnet-based approaches. In , Human Language Technologies: Conference of
      the North American Chapter of the Association of Computational
      Linguistics, Proceedings, May 31 - June 5, 2009, Boulder, Colorado, USA
      (pp. 19–27). : The Association for Computational Linguistics.

    License: Unspecified

    Each sample consists of a pair of words, and a score with scale from
    0 (totally unrelated words) to 5 (very much related or identical words).

    Parameters
    ----------
    segment : str, default 'trial'
        Dataset segment. Options are 'trial', 'test'.
    language : str, default 'en'
        Dataset language. Options are 'en', 'es', 'de', 'it', 'fa'.
    root : str, default '$MXNET_HOME/datasets/semeval17task2'
        Path to temp folder for storing data.
        MXNET_HOME defaults to '~/.mxnet'.

    Examples
    --------
    >>> semeval17task2 = gluonnlp.data.SemEval17Task2(root='./datasets/semeval17task2')
    -etc-
    >>> len(semeval17task2)
    18
    >>> semeval17task2[0]
    ['sunset', 'string', 0.05]
    """
    _url = 'http://alt.qcri.org/semeval2017/task2/data/uploads/semeval2017-task2.zip'
    _archive_file = ('semeval2017-task2.zip',
                     'b29860553f98b057303815817dfb60b9fe79cfba')
    _checksums = C.SEMEVAL17_CHECKSUMS

    # Word pairs and their gold scores live in two parallel files.
    _datatemplate = ('SemEval17-Task2/{segment}/subtask1-monolingual/data/'
                     '{language}.{segment}.data.txt')
    _keytemplate = ('SemEval17-Task2/{segment}/subtask1-monolingual/keys/'
                    '{language}.{segment}.gold.txt')

    min = 0
    max = 5
    segments = ('trial', 'test')
    languages = ('en', 'es', 'de', 'it', 'fa')

    def __init__(self, segment='trial', language='en', root=os.path.join(
            get_home_dir(), 'datasets', 'semeval17task2')):
        assert segment in self.segments
        assert language in self.languages
        self.language = language
        self.segment = segment
        super(SemEval17Task2, self).__init__(root=root)

    def _get_data(self):
        def resolve(template):
            # Fill in the template and rebuild it as a local path.
            relative = template.format(segment=self.segment,
                                       language=self.language)
            return os.path.join(self.root, *relative.split('/'))

        data_path = resolve(self._datatemplate)
        keys_path = resolve(self._keytemplate)
        word_pairs = CorpusDataset(data_path)
        gold_scores = CorpusDataset(keys_path)

        # Line i of the key file holds the score for line i of the data file.
        return [[pair[0], pair[1], gold[0]]
                for pair, gold in zip(word_pairs, gold_scores)]
@register
class BakerVerb143(WordSimilarityEvaluationDataset):
    """Verb143 dataset.

    - Baker, S., Reichart, R., & Korhonen, A. (2014). An unsupervised model for
      instance level subcategorization acquisition. In A. Moschitti, B. Pang, &
      W. Daelemans, Proceedings of the 2014 Conference on Empirical Methods in
      Natural Language Processing, {EMNLP} 2014, October 25-29, 2014, Doha,
      Qatar, {A} meeting of SIGDAT, a Special Interest Group of the ACL
      (pp. 278–289). : ACL.

    144 pairs of verbs annotated by 10 annotators following the WS-353
    guidelines.

    License: unspecified

    Each sample consists of a pair of words, and a score with scale from
    0 (totally unrelated words) to 1 (very much related or identical words).

    Parameters
    ----------
    root : str, default '$MXNET_HOME/datasets/verb143'
        Path to temp folder for storing data.
        MXNET_HOME defaults to '~/.mxnet'.

    Examples
    --------
    >>> bakerverb143 = gluonnlp.data.BakerVerb143(root='./datasets/bakerverb143') #doctest:+SKIP
    -etc-
    >>> len(bakerverb143) #doctest:+SKIP
    144
    >>> bakerverb143[0] #doctest:+SKIP
    ['happen', 'say', 0.19]
    """
    _url = 'https://ie.technion.ac.il/~roiri/papers/EMNLP14.zip'
    _archive_file = ('EMNLP14.zip',
                     '1862e52af784e76e83d472532a75eb797fb8b807')
    _checksums = {
        'verb_similarity dataset.txt':
        'd7e4820c7504cbae56898353e4d94e6408c330fc'
    }
    # ie.technion.ac.il serves an invalid cert as of 2018-04-16, so
    # certificate verification is switched off for this download.
    _verify_ssl = False

    min = 0
    max = 1

    def __init__(self, root=os.path.join(get_home_dir(), 'datasets',
                                         'verb143')):
        super(BakerVerb143, self).__init__(root=root)

    def _get_data(self):
        rows = CorpusDataset(
            os.path.join(self.root, 'verb_similarity dataset.txt'))
        # Column 12 is used as the score (presumably the aggregate that
        # follows the per-annotator columns; confirm against the data file).
        samples = []
        for row in rows:
            samples.append([row[0], row[1], row[12]])
        return samples
@register
class YangPowersVerb130(WordSimilarityEvaluationDataset):
    """Verb-130 dataset.

    - Yang, D., & Powers, D. M. (2006). Verb similarity on the taxonomy of
      wordnet. In The Third International WordNet Conference: GWC 2006

    License: Unspecified

    Each sample consists of a pair of words, and a score with scale from
    0 (totally unrelated words) to 4 (very much related or identical words).

    The word pairs are embedded in this class, so nothing is downloaded.

    Parameters
    ----------
    root : str, default '$MXNET_HOME/datasets/verb130'
        Path to temp folder for storing data.
        MXNET_HOME defaults to '~/.mxnet'.

    Examples
    --------
    >>> yangpowersverb130 = gluonnlp.data.YangPowersVerb130(root='./datasets/yangpowersverb130')
    >>> len(yangpowersverb130)
    130
    >>> yangpowersverb130[0]
    ['brag', 'boast', 4.0]
    """

    # _words1[i] and _words2[i] form the i-th verb pair.
    _words1 = [
        'brag', 'concoct', 'divide', 'build', 'end', 'accentuate',
        'demonstrate', 'solve', 'consume', 'position', 'swear', 'furnish',
        'merit', 'submit', 'seize', 'spin', 'enlarge', 'swing', 'circulate',
        'recognize', 'resolve', 'prolong', 'tap', 'block', 'arrange', 'twist',
        'hail', 'dissipate', 'approve', 'impose', 'hasten', 'rap', 'lean',
        'make', 'show', 'sell', 'weave', 'refer', 'distribute', 'twist',
        'drain', 'depict', 'build', 'hail', 'call', 'swing', 'yield', 'split',
        'challenge', 'hinder', 'welcome', 'need', 'refer', 'finance', 'expect',
        'terminate', 'yell', 'swell', 'rotate', 'seize', 'approve', 'supply',
        'clip', 'divide', 'advise', 'complain', 'want', 'twist', 'swing',
        'make', 'hinder', 'build', 'express', 'resolve', 'bruise', 'swing',
        'catch', 'swear', 'request', 'arrange', 'relieve', 'move', 'weave',
        'swear', 'forget', 'supervise', 'situate', 'explain', 'ache',
        'evaluate', 'recognize', 'dilute', 'hasten', 'scorn', 'swear',
        'arrange', 'discard', 'list', 'stamp', 'market', 'boil', 'sustain',
        'resolve', 'dissipate', 'anger', 'approve', 'research', 'request',
        'boast', 'furnish', 'refine', 'acknowledge', 'clean', 'lean',
        'postpone', 'hail', 'remember', 'scrape', 'sweat', 'highlight',
        'seize', 'levy', 'alter', 'refer', 'empty', 'flush', 'shake',
        'imitate', 'correlate', 'refer'
    ]
    _words2 = [
        'boast', 'devise', 'split', 'construct', 'terminate', 'highlight',
        'show', 'figure', 'eat', 'situate', 'vow', 'supply', 'deserve',
        'yield', 'take', 'twirl', 'swell', 'sway', 'distribute', 'acknowledge',
        'settle', 'sustain', 'knock', 'hinder', 'plan', 'curl', 'acclaim',
        'disperse', 'support', 'levy', 'accelerate', 'tap', 'rest', 'earn',
        'publish', 'market', 'intertwine', 'direct', 'commercialize',
        'intertwine', 'tap', 'recognize', 'organize', 'address', 'refer',
        'bounce', 'seize', 'crush', 'yield', 'assist', 'recognize', 'deserve',
        'explain', 'build', 'deserve', 'postpone', 'boast', 'curl', 'situate',
        'request', 'scorn', 'consume', 'twist', 'figure', 'furnish', 'boast',
        'deserve', 'fasten', 'crash', 'trade', 'yield', 'propose', 'figure',
        'examine', 'split', 'break', 'consume', 'explain', 'levy', 'study',
        'hinder', 'swell', 'print', 'think', 'resolve', 'concoct', 'isolate',
        'boast', 'spin', 'terminate', 'succeed', 'market', 'permit', 'yield',
        'describe', 'explain', 'arrange', 'figure', 'weave', 'sweeten', 'tap',
        'lower', 'publicize', 'isolate', 'approve', 'boast', 'distribute',
        'concoct', 'yield', 'impress', 'sustain', 'distribute', 'concoct',
        'grate', 'show', 'judge', 'hail', 'lean', 'spin', 'restore', 'refer',
        'believe', 'highlight', 'carry', 'situate', 'spin', 'swell',
        'highlight', 'levy', 'lean'
    ]

    _url = ('https://dspace2.flinders.edu.au/xmlui/bitstream/handle/'
            '2328/9557/Yang%20Verb.pdf?sequence=1')

    min = 0
    max = 4

    # The default root goes through get_home_dir() like every other dataset
    # in this module, so it follows the documented '$MXNET_HOME' default
    # instead of a hard-coded '~/.mxnet'.
    def __init__(self, root=os.path.join(get_home_dir(), 'datasets',
                                         'verb130')):
        super(YangPowersVerb130, self).__init__(root=root)

    def _get_data(self):
        # The pairs are stored in five consecutive runs of 26 pairs each,
        # with the score falling from 4 to 0 from one run to the next.
        scores = [4] * 26 + [3] * 26 + [2] * 26 + [1] * 26 + [0] * 26
        return list(zip(self._words1, self._words2, scores))

    def _download_data(self):
        # Overwrite download method as this dataset is self-contained
        pass
###############################################################################
# Word analogy datasets
###############################################################################
class WordAnalogyEvaluationDataset(_Dataset):
    """Base class for word analogy task datasets.

    Inheriting classes are assumed to implement datasets of the form
    ['word1', 'word2', 'word3', 'word4'] or ['word1', [ 'word2a', 'word2b',
    ... ], 'word3', [ 'word4a', 'word4b', ... ]].
    """

    def _get_data(self):
        # Concrete analogy datasets provide the actual samples.
        raise NotImplementedError
@register(category=C.GOOGLEANALOGY_CATEGORIES)
class GoogleAnalogyTestSet(WordAnalogyEvaluationDataset):
    """Google analogy test set

    - Mikolov, T., Chen, K., Corrado, G., & Dean, J. (2013). Efficient
      estimation of word representations in vector space. In Proceedings of
      the International Conference on Learning Representations (ICLR).

    License: Unspecified

    Each sample consists of two analogical pairs of words.

    Parameters
    ----------
    group : {'syntactic', 'semantic'} or None, default None
        The subset for the specified type of analogy. None for the complete
        dataset.
    category : str or None, default None
        The subset for the specified category of analogy. None for the
        complete dataset.
    lowercase : boolean, default True
        Whether to convert words to lowercase.
    root : str, default '$MXNET_HOME/datasets/google_analogy'
        Path to temp folder for storing data.
        MXNET_HOME defaults to '~/.mxnet'.

    Examples
    --------
    >>> googleanalogytestset = gluonnlp.data.GoogleAnalogyTestSet(
    ...     root='./datasets/googleanalogytestset')
    -etc-
    >>> len(googleanalogytestset)
    19544
    >>> googleanalogytestset[0]
    ['athens', 'greece', 'baghdad', 'iraq']
    >>> googleanalogytestset = gluonnlp.data.GoogleAnalogyTestSet(
    ...     'syntactic', root='./datasets/googleanalogytestset')
    >>> googleanalogytestset[0]
    ['amazing', 'amazingly', 'apparent', 'apparently']
    >>> googleanalogytestset = gluonnlp.data.GoogleAnalogyTestSet(
    ...     'syntactic', 'gram8-plural', root='./datasets/googleanalogytestset')
    >>> googleanalogytestset[0]
    ['banana', 'bananas', 'bird', 'birds']
    """

    # The download is a plain text file, so the "archive" and its single
    # content file are the same file with the same checksum.
    _archive_file = ('questions-words.txt',
                     'fa92df4bbe788f2d51827c762c63bd8e470edf31')
    _checksums = {
        'questions-words.txt': 'fa92df4bbe788f2d51827c762c63bd8e470edf31'
    }
    _url = 'http://download.tensorflow.org/data/questions-words.txt'

    groups = ['syntactic', 'semantic']
    categories = C.GOOGLEANALOGY_CATEGORIES

    def __init__(self, group=None,
                 category=None, lowercase=True, root=os.path.join(
                     get_home_dir(), 'datasets', 'google_analogy')):

        assert group is None or group in self.groups
        assert category is None or category in self.categories
        self.category = category
        self.group = group
        self.lowercase = lowercase
        super(GoogleAnalogyTestSet, self).__init__(root=root)

    def _get_data(self):
        samples = []
        path = os.path.join(self.root, self._archive_file[0])
        with open(path) as f:
            for line in f:
                if line.startswith(':'):
                    # Section header (': <category>'): remember which
                    # category and group the following lines belong to.
                    current_category = line.split()[1]
                    current_group = ('syntactic' if 'gram' in current_category
                                     else 'semantic')
                    continue

                # Skip samples outside the requested group / category.
                if self.group is not None and self.group != current_group:
                    continue
                if (self.category is not None
                        and self.category != current_category):
                    continue

                text = line.lower() if self.lowercase else line
                samples.append(text.split())

        return samples
@register(category=list(C.BATS_CATEGORIES.keys()))
class BiggerAnalogyTestSet(WordAnalogyEvaluationDataset):
    """Bigger analogy test set

    - Gladkova, A., Drozd, A., & Matsuoka, S. (2016). Analogy-based detection
      of morphological and semantic relations with word embeddings: what works
      and what doesn’t. In Proceedings of the NAACL-HLT SRW (pp. 47–54). San
      Diego, California, June 12-17, 2016: ACL. Retrieved from
      https://www.aclweb.org/anthology/N/N16/N16-2002.pdf

    License: Unspecified

    Each sample consists of two analogical pairs of words.

    Parameters
    ----------
    category : str or None, default None
        Key of the category to load. None loads every category.
    form_analogy_pairs : bool, default True
        If True, every ordered combination of two differing rows of a
        category is turned into one four-element sample. If False, the
        two-element rows are returned as they are.
    drop_alternative_solutions : bool, default True
        If True, only the first of the '/'-separated entries in the second
        column is kept. If False, the second column stays a list of all
        entries.
    root : str, default '$MXNET_HOME/datasets/bigger_analogy'
        Path to temp folder for storing data.
        MXNET_HOME defaults to '~/.mxnet'.

    Examples
    --------
    >>> biggeranalogytestset = gluonnlp.data.BiggerAnalogyTestSet(
    ...     root='./datasets/biggeranalogytestset')
    -etc-
    >>> len(biggeranalogytestset)
    98000
    >>> biggeranalogytestset[0]
    ['album', 'albums', 'application', 'applications']
    """
    _archive_file = ('BATS_3.0.zip',
                     'bf94d47884be9ea83af369beeea7499ed25dcf0d')
    _checksums = C.BATS_CHECKSUMS
    _url = 'https://s3.amazonaws.com/blackbirdprojects/tut_vsm/BATS_3.0.zip'

    # The first character of a category key selects the archive sub-folder.
    _category_group_map = {
        'I': '1_Inflectional_morphology',
        'D': '2_Derivational_morphology',
        'E': '3_Encyclopedic_semantics',
        'L': '4_Lexicographic_semantics'
    }
    _categories = C.BATS_CATEGORIES

    def __init__(self, category=None, form_analogy_pairs=True,
                 drop_alternative_solutions=True, root=os.path.join(
                     get_home_dir(), 'datasets', 'bigger_analogy')):
        self.form_analogy_pairs = form_analogy_pairs
        self.drop_alternative_solutions = drop_alternative_solutions
        self.category = category

        if self.category is not None:
            assert self.category in self._categories

        super(BiggerAnalogyTestSet, self).__init__(root=root)

    def _get_data(self):
        if self.category is None:
            selected = self._categories.keys()
        else:
            selected = [self.category]

        samples = []
        for category in selected:
            relative = 'BATS_3.0/{group}/{category} {category_name}.txt'.format(
                group=self._category_group_map[category[0]],
                category=category,
                category_name=self._categories[category])
            rows = CorpusDataset(
                os.path.join(self.root, *relative.split('/')))
            pairs = [[row[0], row[1].split('/')] for row in rows]

            # Drop alternative solutions separated by '/' from word2 column
            if self.drop_alternative_solutions:
                pairs = [[word, alternatives[0]]
                         for word, alternatives in pairs]

            # Final dataset consists of all analogy pairs per category
            if self.form_analogy_pairs:
                pairs = [[first[0], first[1], second[0], second[1]]
                         for first in pairs for second in pairs
                         if first != second]

            samples += pairs
        return samples