Publications
2025
- Manuscript: A Distributional Perspective on Word Learning in Neural Language Models. Filippo Ficarra, Ryan Cotterell, and Alex Warstadt. 2025.
@unpublished{ficarra2025disributional, title = {{A Distributional Perspective on Word Learning in Neural Language Models}}, author = {Ficarra, Filippo and Cotterell, Ryan and Warstadt, Alex}, year = {2025}, annote = {Unpublished manuscript}, }
- Manuscript: Can Language Models Learn Typologically Implausible Languages? Tianyang Xu, Tatsuki Kuribayashi, Yohei Oseki, Ryan Cotterell, and Alex Warstadt. 2025.
@unpublished{xu2025implausible, title = {{Can Language Models Learn Typologically Implausible Languages?}}, author = {Xu, Tianyang and Kuribayashi, Tatsuki and Oseki, Yohei and Cotterell, Ryan and Warstadt, Alex}, year = {2025}, annote = {Unpublished manuscript}, }
2024
- TACL (to appear): Investigating Critical Period Effects in Language Acquisition through Neural Language Models. Ionut Constantinescu, Tiago Pimentel, Ryan Cotterell, and Alex Warstadt. Transactions of the Association for Computational Linguistics, 2024.
Humans appear to have a critical period (CP) for language acquisition: Second language (L2) acquisition becomes harder after early childhood, and ceasing exposure to a first language (L1) after this period (but not before) typically does not lead to substantial loss of L1 proficiency. It is unknown whether these CP effects result from innately determined brain maturation or as a stabilization of neural connections naturally induced by experience. In this study, we use language models (LMs) to test the extent to which these phenomena are peculiar to humans, or shared by a broader class of language learners. We vary the age of exposure by training LMs on language pairs in various experimental conditions, and find that LMs, which lack any direct analog to innate maturational stages, do not show CP effects when the age of exposure of L2 is delayed. Our results contradict the claim that CP effects are an inevitable result of statistical learning, and they are consistent with an innate mechanism for CP effects. We show that we can reverse-engineer the CP by introducing a regularizer partway through training to simulate a maturational decrease in plasticity. All in all, our results suggest that L1 learning on its own may not be enough to induce a CP, and additional engineering is necessary to make language models more cognitively plausible.
@article{constantinescu2024critical, title = {Investigating {Critical} {Period} {Effects} in {Language} {Acquisition} through {Neural} {Language} {Models}}, language = {en}, urldate = {2024-10-23}, journal = {Transactions of the Association for Computational Linguistics}, author = {Constantinescu, Ionut and Pimentel, Tiago and Cotterell, Ryan and Warstadt, Alex}, year = {2024}, keywords = {Computer Science - Computation and Language}, }
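The abstract above describes reverse-engineering a critical period by introducing a regularizer partway through training. As a rough illustration, here is a minimal PyTorch sketch of that idea; the toy model, objective, penalty form, and all hyperparameters are my assumptions, not the paper's exact setup. After a simulated maturation point, an EWC-style quadratic penalty pulls the weights toward an anchored copy, lowering plasticity for anything learned later.

```python
import torch
import torch.nn as nn

model = nn.Linear(10, 10)                       # stand-in for a language model
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
anchor = None                                   # weights frozen at "maturation"
lam = 100.0                                     # strength of the plasticity penalty

for step in range(2000):
    x = torch.randn(32, 10)
    loss = nn.functional.mse_loss(model(x), x)  # stand-in training objective
    if step == 1000:                            # simulated maturation point
        anchor = [p.detach().clone() for p in model.parameters()]
    if anchor is not None:
        # quadratic pull toward the anchored weights reduces plasticity
        # for everything learned after the simulated critical period
        loss = loss + lam * sum(((p - a) ** 2).sum()
                                for p, a in zip(model.parameters(), anchor))
    opt.zero_grad()
    loss.backward()
    opt.step()
```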
- COLING: Automatic annotation of grammaticality in child-caregiver conversations. Mitja Nikolaus, Abhishek Agrawal, Petros Kaklamanis, Alex Warstadt, and Abdellah Fourtassi. In Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024), May 2024.
The acquisition of grammar has been a central question to adjudicate between theories of language acquisition. In order to conduct faster, more reproducible, and larger-scale corpus studies on grammaticality in child-caregiver conversations, tools for automatic annotation can offer an effective alternative to tedious manual annotation. We propose a coding scheme for context-dependent grammaticality in child-caregiver conversations and annotate more than 4,000 utterances from a large corpus of transcribed conversations. Based on these annotations, we train and evaluate a range of NLP models. Our results show that fine-tuned Transformer-based models perform best, achieving human inter-annotation agreement levels. As a first application and sanity check of this tool, we use the trained models to annotate a corpus almost two orders of magnitude larger than the manually annotated data and verify that children’s grammaticality shows a steady increase with age. This work contributes to the growing literature on applying state-of-the-art NLP methods to help study child language acquisition at scale.
@inproceedings{nikolaus2024automatic, address = {Torino, Italia}, title = {Automatic annotation of grammaticality in child-caregiver conversations}, booktitle = {Proceedings of the 2024 joint international conference on computational linguistics, language resources and evaluation ({LREC}-{COLING} 2024)}, publisher = {ELRA and ICCL}, author = {Nikolaus, Mitja and Agrawal, Abhishek and Kaklamanis, Petros and Warstadt, Alex and Fourtassi, Abdellah}, editor = {Calzolari, Nicoletta and Kan, Min-Yen and Hoste, Veronique and Lenci, Alessandro and Sakti, Sakriani and Xue, Nianwen}, month = may, year = {2024}, pages = {1832--1844}, }
- Under Review: Bigger is not always better: The importance of human-scale language modeling for psycholinguistics. Ethan Gotlieb Wilcox, Michael Hu, Aaron Mueller, Tal Linzen, Alex Warstadt, Leshem Choshen, Chengxu Zhuang, Ryan Cotterell, and Adina Williams. 2024.
@unpublished{wilcox2024bigger, title = {Bigger is not always better: {The} importance of human-scale language modeling for psycholinguistics}, author = {Wilcox, Ethan Gotlieb and Hu, Michael and Mueller, Aaron and Linzen, Tal and Warstadt, Alex and Choshen, Leshem and Zhuang, Chengxu and Cotterell, Ryan and Williams, Adina}, year = {2024}, }
- EMNLP: Surprise! Uniform Information Density Isn't the Whole Story: Predicting Surprisal Contours in Long-form Discourse. Eleftheria Tsipidi, Franz Nowak, Ryan Cotterell, Ethan Wilcox, Mario Giulianelli, and Alex Warstadt. In Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing (EMNLP), Nov 2024.
The Uniform Information Density (UID) hypothesis posits that speakers tend to distribute information evenly across linguistic units to achieve efficient communication. Of course, information rate in texts and discourses is not perfectly uniform. While these fluctuations can be viewed as theoretically uninteresting noise on top of a uniform target, another explanation is that UID is not the only functional pressure regulating information content in a language. Speakers may also seek to maintain interest, adhere to writing conventions, and build compelling arguments. In this paper, we propose one such functional pressure; namely that speakers modulate information rate based on location within a hierarchically-structured model of discourse. We term this the Structured Context Hypothesis and test it by predicting the surprisal contours of naturally occurring discourses extracted from large language models using predictors derived from discourse structure. We find that hierarchical predictors are significant predictors of a discourse’s information contour and that deeply nested hierarchical predictors are more predictive than shallow ones. This work takes an initial step beyond UID to propose testable hypotheses for why the information rate fluctuates in predictable ways.
@inproceedings{tsipidi2024surprise, address = {Miami, USA}, title = {{S}urprise! {U}niform {I}nformation {D}ensity Isn't the Whole Story: {P}redicting Surprisal Contours in Long-form Discourse}, booktitle = {Proceedings of the 2024 conference on empirical methods in natural language processing ({EMNLP})}, publisher = {Association for Computational Linguistics}, author = {Tsipidi, Eleftheria and Nowak, Franz and Cotterell, Ryan and Wilcox, Ethan and Giulianelli, Mario and Warstadt, Alex}, month = nov, year = {2024}, }
2023
- EMNLP: Quantifying the redundancy between prosody and text. Lukas Wolf, Tiago Pimentel, Evelina Fedorenko, Ryan Cotterell, Alex Warstadt, Ethan Wilcox, and Tamar Regev. In Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing (EMNLP), Dec 2023.
Prosody—the suprasegmental component of speech, including pitch, loudness, and tempo—carries critical aspects of meaning. However, the relationship between the information conveyed by prosody vs. by the words themselves remains poorly understood. We use large language models (LLMs) to estimate how much information is redundant between prosody and the words themselves. Using a large spoken corpus of English audiobooks, we extract prosodic features aligned to individual words and test how well they can be predicted from LLM embeddings, compared to non-contextual word embeddings. We find a high degree of redundancy between the information carried by the words and prosodic information across several prosodic features, including intensity, duration, pauses, and pitch contours. Furthermore, a word’s prosodic information is redundant with both the word itself and the context preceding as well as following it. Still, we observe that prosodic features can not be fully predicted from text, suggesting that prosody carries information above and beyond the words. Along with this paper, we release a general-purpose data processing pipeline for quantifying the relationship between linguistic information and extra-linguistic features.
@inproceedings{wolf2023quantifying, address = {Singapore}, title = {Quantifying the redundancy between prosody and text}, doi = {10.18653/v1/2023.emnlp-main.606}, booktitle = {Proceedings of the 2023 conference on empirical methods in natural language processing ({EMNLP})}, publisher = {Association for Computational Linguistics}, author = {Wolf, Lukas and Pimentel, Tiago and Fedorenko, Evelina and Cotterell, Ryan and Warstadt, Alex and Wilcox, Ethan and Regev, Tamar}, editor = {Bouamor, Houda and Pino, Juan and Bali, Kalika}, month = dec, year = {2023}, pages = {9765--9784}, }
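The probing setup described above can be pictured with a small sketch. This is a hedged illustration with random stand-in arrays, not the released pipeline: fit a ridge regression from contextual embeddings (and, separately, from non-contextual ones) to a word-level prosodic feature, then compare held-out R^2. The gap between the contextual and static scores estimates how much prosodic information is predictable from the surrounding text.

```python
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score

rng = np.random.default_rng(0)
n_words = 2000
contextual = rng.normal(size=(n_words, 768))  # stand-in: contextual LLM embeddings
static = rng.normal(size=(n_words, 300))      # stand-in: non-contextual word vectors
duration = rng.normal(size=n_words)           # stand-in: one prosodic feature per word

for name, X in [("contextual", contextual), ("static", static)]:
    r2 = cross_val_score(Ridge(alpha=1.0), X, duration, cv=5, scoring="r2")
    print(f"{name}: held-out R^2 = {r2.mean():.3f}")
```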
- Under Review: A geometric notion of causal probing. Clément Guerner, Anej Svete, Tianyu Liu, Alexander Warstadt, and Ryan Cotterell. arXiv preprint arXiv:2307.15054, 2023.
@article{guerner2023geometric, title = {A geometric notion of causal probing}, journal = {arXiv preprint arXiv:2307.15054}, author = {Guerner, Clément and Svete, Anej and Liu, Tianyu and Warstadt, Alexander and Cotterell, Ryan}, year = {2023}, }
- ACL: Generalizing backpropagation for gradient-based interpretability. Kevin Du, Lucas Torroba Hennigen, Niklas Stoehr, Alex Warstadt, and Ryan Cotterell. In Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), Jul 2023.
Many popular feature-attribution methods for interpreting deep neural networks rely on computing the gradients of a model’s output with respect to its inputs. While these methods can indicate which input features may be important for the model’s prediction, they reveal little about the inner workings of the model itself. In this paper, we observe that the gradient computation of a model is a special case of a more general formulation using semirings. This observation allows us to generalize the backpropagation algorithm to efficiently compute other interpretable statistics about the gradient graph of a neural network, such as the highest-weighted path and entropy. We implement this generalized algorithm, evaluate it on synthetic datasets to better understand the statistics it computes, and apply it to study BERT’s behavior on the subject–verb number agreement task (SVA). With this method, we (a) validate that the amount of gradient flow through a component of a model reflects its importance to a prediction and (b) for SVA, identify which pathways of the self-attention mechanism are most important.
@inproceedings{du2023generalizing, address = {Toronto, Canada}, title = {Generalizing backpropagation for gradient-based interpretability}, doi = {10.18653/v1/2023.acl-long.669}, booktitle = {Proceedings of the 61st annual meeting of the association for computational linguistics (volume 1: {Long} papers)}, publisher = {Association for Computational Linguistics}, author = {Du, Kevin and Torroba Hennigen, Lucas and Stoehr, Niklas and Warstadt, Alex and Cotterell, Ryan}, editor = {Rogers, Anna and Boyd-Graber, Jordan and Okazaki, Naoaki}, month = jul, year = {2023}, pages = {11979--11995}, }
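The semiring observation above can be demonstrated on a toy graph. The sketch below is illustrative, not the paper's implementation: a single dynamic program aggregates products of local derivatives over paths, and only the semiring changes. With (+, ×) it returns the ordinary gradient; with (max, ×) it returns the weight of the highest-weighted gradient path. Nonnegative toy derivatives keep the max-product aggregation well behaved.

```python
# Semiring "backpropagation" over a tiny chain graph: x -> {h0, h1} -> y.
def aggregate_paths(layers, plus, times, unit):
    """layers: list of dicts mapping (i, j) -> local derivative d out_j / d in_i."""
    values = {0: unit}  # semiring-sum over all paths from input node 0 so far
    for layer in layers:
        nxt = {}
        for (i, j), d in layer.items():
            if i in values:
                contrib = times(values[i], d)
                nxt[j] = plus(nxt[j], contrib) if j in nxt else contrib
        values = nxt
    return values

layers = [
    {(0, 0): 2.0, (0, 1): 3.0},  # dh0/dx, dh1/dx (hand-picked toy values)
    {(0, 0): 0.5, (1, 0): 1.0},  # dy/dh0, dy/dh1
]
grad = aggregate_paths(layers, plus=lambda a, b: a + b,
                       times=lambda a, b: a * b, unit=1.0)
best = aggregate_paths(layers, plus=max,
                       times=lambda a, b: a * b, unit=1.0)
print(grad[0])  # 2.0*0.5 + 3.0*1.0 = 4.0, the true dy/dx
print(best[0])  # max(1.0, 3.0) = 3.0, the highest-weighted path product
```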
- TMLR: Beyond the Imitation Game: Quantifying and extrapolating the capabilities of language models. Aarohi Srivastava, Abhinav Rastogi, Abhishek Rao, and 447 more authors. Transactions on Machine Learning Research, 2023.
@article{srivastava2023beyond, title = {Beyond the {Imitation} {Game}: {Quantifying} and extrapolating the capabilities of language models}, issn = {2835-8856}, journal = {Transactions on Machine Learning Research}, author = {Srivastava, Aarohi and Rastogi, Abhinav and Rao, Abhishek and Shoeb, Abu Awal Md and Abid, Abubakar and Fisch, Adam and Brown, Adam R. and Santoro, Adam and Gupta, Aditya and Garriga-Alonso, Adrià and Kluska, Agnieszka and Lewkowycz, Aitor and Agarwal, Akshat and Power, Alethea and Ray, Alex and Warstadt, Alex and Kocurek, Alexander W. and Safaya, Ali and Tazarv, Ali and Xiang, Alice and Parrish, Alicia and Nie, Allen and Hussain, Aman and Askell, Amanda and Dsouza, Amanda and Slone, Ambrose and Rahane, Ameet and Iyer, Anantharaman S. and Andreassen, Anders Johan and Madotto, Andrea and Santilli, Andrea and Stuhlmüller, Andreas and Dai, Andrew M. and La, Andrew and Lampinen, Andrew and Zou, Andy and Jiang, Angela and Chen, Angelica and Vuong, Anh and Gupta, Animesh and Gottardi, Anna and Norelli, Antonio and Venkatesh, Anu and Gholamidavoodi, Arash and Tabassum, Arfa and Menezes, Arul and Kirubarajan, Arun and Mullokandov, Asher and Sabharwal, Ashish and Herrick, Austin and Efrat, Avia and Erdem, Aykut and Karakaş, Ayla and Roberts, B. Ryan and Loe, Bao Sheng and Zoph, Barret and Bojanowski, Bartłomiej and Özyurt, Batuhan and Hedayatnia, Behnam and Neyshabur, Behnam and Inden, Benjamin and Stein, Benno and Ekmekci, Berk and Lin, Bill Yuchen and Howald, Blake and Orinion, Bryan and Diao, Cameron and Dour, Cameron and Stinson, Catherine and Argueta, Cedrick and Ferri, Cesar and Singh, Chandan and Rathkopf, Charles and Meng, Chenlin and Baral, Chitta and Wu, Chiyu and Callison-Burch, Chris and Waites, Christopher and Voigt, Christian and Manning, Christopher D and Potts, Christopher and Ramirez, Cindy and Rivera, Clara E. and Siro, Clemencia and Raffel, Colin and Ashcraft, Courtney and Garbacea, Cristina and Sileo, Damien and Garrette, Dan and Hendrycks, Dan and Kilman, Dan and Roth, Dan and Freeman, C. 
Daniel and Khashabi, Daniel and Levy, Daniel and González, Daniel Moseguı́ and Perszyk, Danielle and Hernandez, Danny and Chen, Danqi and Ippolito, Daphne and Gilboa, Dar and Dohan, David and Drakard, David and Jurgens, David and Datta, Debajyoti and Ganguli, Deep and Emelin, Denis and Kleyko, Denis and Yuret, Deniz and Chen, Derek and Tam, Derek and Hupkes, Dieuwke and Misra, Diganta and Buzan, Dilyar and Mollo, Dimitri Coelho and Yang, Diyi and Lee, Dong-Ho and Schrader, Dylan and Shutova, Ekaterina and Cubuk, Ekin Dogus and Segal, Elad and Hagerman, Eleanor and Barnes, Elizabeth and Donoway, Elizabeth and Pavlick, Ellie and Rodolà, Emanuele and Lam, Emma and Chu, Eric and Tang, Eric and Erdem, Erkut and Chang, Ernie and Chi, Ethan A and Dyer, Ethan and Jerzak, Ethan and Kim, Ethan and Manyasi, Eunice Engefu and Zheltonozhskii, Evgenii and Xia, Fanyue and Siar, Fatemeh and Martı́nez-Plumed, Fernando and Happé, Francesca and Chollet, Francois and Rong, Frieda and Mishra, Gaurav and Winata, Genta Indra and de Melo, Gerard and Kruszewski, Germán and Parascandolo, Giambattista and Mariani, Giorgio and Wang, Gloria Xinyue and Jaimovitch-Lopez, Gonzalo and Betz, Gregor and Gur-Ari, Guy and Galijasevic, Hana and Kim, Hannah and Rashkin, Hannah and Hajishirzi, Hannaneh and Mehta, Harsh and Bogar, Hayden and Shevlin, Henry Francis Anthony and Schuetze, Hinrich and Yakura, Hiromu and Zhang, Hongming and Wong, Hugh Mee and Ng, Ian and Noble, Isaac and Jumelet, Jaap and Geissinger, Jack and Kernion, Jackson and Hilton, Jacob and Lee, Jaehoon and Fisac, Jaime Fernández and Simon, James B and Koppel, James and Zheng, James and Zou, James and Kocon, Jan and Thompson, Jana and Wingfield, Janelle and Kaplan, Jared and Radom, Jarema and Sohl-Dickstein, Jascha and Phang, Jason and Wei, Jason and Yosinski, Jason and Novikova, Jekaterina and Bosscher, Jelle and Marsh, Jennifer and Kim, Jeremy and Taal, Jeroen and Engel, Jesse and Alabi, Jesujoba and Xu, Jiacheng and Song, Jiaming and Tang, Jillian and Waweru, Joan and Burden, John and Miller, John and Balis, John U. and Batchelder, Jonathan and Berant, Jonathan and Frohberg, Jörg and Rozen, Jos and Hernandez-Orallo, Jose and Boudeman, Joseph and Guerr, Joseph and Jones, Joseph and Tenenbaum, Joshua B. and Rule, Joshua S. 
and Chua, Joyce and Kanclerz, Kamil and Livescu, Karen and Krauth, Karl and Gopalakrishnan, Karthik and Ignatyeva, Katerina and Markert, Katja and Dhole, Kaustubh and Gimpel, Kevin and Omondi, Kevin and Mathewson, Kory Wallace and Chiafullo, Kristen and Shkaruta, Ksenia and Shridhar, Kumar and McDonell, Kyle and Richardson, Kyle and Reynolds, Laria and Gao, Leo and Zhang, Li and Dugan, Liam and Qin, Lianhui and Contreras-Ochando, Lidia and Morency, Louis-Philippe and Moschella, Luca and Lam, Lucas and Noble, Lucy and Schmidt, Ludwig and He, Luheng and Oliveros-Colón, Luis and Metz, Luke and Senel, Lütfi Kerem and Bosma, Maarten and Sap, Maarten and Hoeve, Maartje Ter and Farooqi, Maheen and Faruqui, Manaal and Mazeika, Mantas and Baturan, Marco and Marelli, Marco and Maru, Marco and Ramirez-Quintana, Maria Jose and Tolkiehn, Marie and Giulianelli, Mario and Lewis, Martha and Potthast, Martin and Leavitt, Matthew L and Hagen, Matthias and Schubert, Mátyás and Baitemirova, Medina Orduna and Arnaud, Melody and McElrath, Melvin and Yee, Michael Andrew and Cohen, Michael and Gu, Michael and Ivanitskiy, Michael and Starritt, Michael and Strube, Michael and Swędrowski, Michał and Bevilacqua, Michele and Yasunaga, Michihiro and Kale, Mihir and Cain, Mike and Xu, Mimee and Suzgun, Mirac and Walker, Mitch and Tiwari, Mo and Bansal, Mohit and Aminnaseri, Moin and Geva, Mor and Gheini, Mozhdeh and T, Mukund Varma and Peng, Nanyun and Chi, Nathan Andrew and Lee, Nayeon and Krakover, Neta Gur-Ari and Cameron, Nicholas and Roberts, Nicholas and Doiron, Nick and Martinez, Nicole and Nangia, Nikita and Deckers, Niklas and Muennighoff, Niklas and Keskar, Nitish Shirish and Iyer, Niveditha S. and Constant, Noah and Fiedel, Noah and Wen, Nuan and Zhang, Oliver and Agha, Omar and Elbaghdadi, Omar and Levy, Omer and Evans, Owain and Casares, Pablo Antonio Moreno and Doshi, Parth and Fung, Pascale and Liang, Paul Pu and Vicol, Paul and Alipoormolabashi, Pegah and Liao, Peiyuan and Liang, Percy and Chang, Peter W and Eckersley, Peter and Htut, Phu Mon and Hwang, Pinyu and Miłkowski, Piotr and Patil, Piyush and Pezeshkpour, Pouya and Oli, Priti and Mei, Qiaozhu and Lyu, Qing and Chen, Qinlang and Banjade, Rabin and Rudolph, Rachel Etta and Gabriel, Raefer and Habacker, Rahel and Risco, Ramon and Millière, Raphaël and Garg, Rhythm and Barnes, Richard and Saurous, Rif A. and Arakawa, Riku and Raymaekers, Robbe and Frank, Robert and Sikand, Rohan and Novak, Roman and Sitelew, Roman and Bras, Ronan Le and Liu, Rosanne and Jacobs, Rowan and Zhang, Rui and Salakhutdinov, Russ and Chi, Ryan Andrew and Lee, Seungjae Ryan and Stovall, Ryan and Teehan, Ryan and Yang, Rylan and Singh, Sahib and Mohammad, Saif M. and Anand, Sajant and Dillavou, Sam and Shleifer, Sam and Wiseman, Sam and Gruetter, Samuel and Bowman, Samuel R. and Schoenholz, Samuel Stern and Han, Sanghyun and Kwatra, Sanjeev and Rous, Sarah A. 
and Ghazarian, Sarik and Ghosh, Sayan and Casey, Sean and Bischoff, Sebastian and Gehrmann, Sebastian and Schuster, Sebastian and Sadeghi, Sepideh and Hamdan, Shadi and Zhou, Sharon and Srivastava, Shashank and Shi, Sherry and Singh, Shikhar and Asaadi, Shima and Gu, Shixiang Shane and Pachchigar, Shubh and Toshniwal, Shubham and Upadhyay, Shyam and Debnath, Shyamolima Shammie and Shakeri, Siamak and Thormeyer, Simon and Melzi, Simone and Reddy, Siva and Makini, Sneha Priscilla and Lee, Soo-Hwan and Torene, Spencer and Hatwar, Sriharsha and Dehaene, Stanislas and Divic, Stefan and Ermon, Stefano and Biderman, Stella and Lin, Stephanie and Prasad, Stephen and Piantadosi, Steven and Shieber, Stuart and Misherghi, Summer and Kiritchenko, Svetlana and Mishra, Swaroop and Linzen, Tal and Schuster, Tal and Li, Tao and Yu, Tao and Ali, Tariq and Hashimoto, Tatsunori and Wu, Te-Lin and Desbordes, Théo and Rothschild, Theodore and Phan, Thomas and Wang, Tianle and Nkinyili, Tiberius and Schick, Timo and Kornev, Timofei and Tunduny, Titus and Gerstenberg, Tobias and Chang, Trenton and Neeraj, Trishala and Khot, Tushar and Shultz, Tyler and Shaham, Uri and Misra, Vedant and Demberg, Vera and Nyamai, Victoria and Raunak, Vikas and Ramasesh, Vinay Venkatesh and prabhu, vinay uday and Padmakumar, Vishakh and Srikumar, Vivek and Fedus, William and Saunders, William and Zhang, William and Vossen, Wout and Ren, Xiang and Tong, Xiaoyu and Zhao, Xinran and Wu, Xinyi and Shen, Xudong and Yaghoobzadeh, Yadollah and Lakretz, Yair and Song, Yangqiu and Bahri, Yasaman and Choi, Yejin and Yang, Yichi and Hao, Yiding and Chen, Yifu and Belinkov, Yonatan and Hou, Yu and Hou, Yufang and Bai, Yuntao and Seid, Zachary and Zhao, Zhuoye and Wang, Zijian and Wang, Zijie J. and Wang, Zirui and Wu, Ziyi}, year = {2023}, }
- BabyLM: WhisBERT: Multimodal text-audio language modeling on 100M words. Lukas Wolf, Klemen Kotar, Greta Tuckute, Eghbal Hosseini, Tamar I. Regev, Ethan Gotlieb Wilcox, and Alexander Scott Warstadt. In Proceedings of the BabyLM Challenge at the 27th Conference on Computational Natural Language Learning, Dec 2023.
@inproceedings{wolf2023whisbert, address = {Singapore}, title = {{WhisBERT}: {Multimodal} text-audio language modeling on {100M} words}, doi = {10.18653/v1/2023.conll-babylm.21}, booktitle = {Proceedings of the {BabyLM} challenge at the 27th conference on computational natural language learning}, publisher = {Association for Computational Linguistics}, author = {Wolf, Lukas and Kotar, Klemen and Tuckute, Greta and Hosseini, Eghbal and I. Regev, Tamar and Gotlieb Wilcox, Ethan and Warstadt, Alexander Scott}, editor = {Warstadt, Alex and Mueller, Aaron and Choshen, Leshem and Wilcox, Ethan and Zhuang, Chengxu and Ciro, Juan and Mosquera, Rafael and Paranjabe, Bhargavi and Williams, Adina and Linzen, Tal and Cotterell, Ryan}, month = dec, year = {2023}, pages = {253--258}, }
- BabyLM: Findings of the BabyLM challenge: Sample-efficient pretraining on developmentally plausible corpora. Alex Warstadt, Aaron Mueller, Leshem Choshen, Ethan Wilcox, Chengxu Zhuang, Juan Ciro, Rafael Mosquera, Bhargavi Paranjabe, Adina Williams, Tal Linzen, and Ryan Cotterell. In Proceedings of the BabyLM Challenge at the 27th Conference on Computational Natural Language Learning, Dec 2023.
@inproceedings{warstadt2023findings, address = {Singapore}, title = {Findings of the {BabyLM} challenge: {Sample}-efficient pretraining on developmentally plausible corpora}, doi = {10.18653/v1/2023.conll-babylm.1}, booktitle = {Proceedings of the {BabyLM} challenge at the 27th conference on computational natural language learning}, publisher = {Association for Computational Linguistics}, author = {Warstadt, Alex and Mueller, Aaron and Choshen, Leshem and Wilcox, Ethan and Zhuang, Chengxu and Ciro, Juan and Mosquera, Rafael and Paranjabe, Bhargavi and Williams, Adina and Linzen, Tal and Cotterell, Ryan}, editor = {Warstadt, Alex and Mueller, Aaron and Choshen, Leshem and Wilcox, Ethan and Zhuang, Chengxu and Ciro, Juan and Mosquera, Rafael and Paranjabe, Bhargavi and Williams, Adina and Linzen, Tal and Cotterell, Ryan}, month = dec, year = {2023}, pages = {1--34}, }
- BabyLM: Acquiring linguistic knowledge from multimodal input. Theodor Amariucai and Alexander Scott Warstadt. In Proceedings of the BabyLM Challenge at the 27th Conference on Computational Natural Language Learning, Dec 2023.
@inproceedings{amariucai2023acquiring, address = {Singapore}, title = {Acquiring linguistic knowledge from multimodal input}, doi = {10.18653/v1/2023.conll-babylm.11}, booktitle = {Proceedings of the {BabyLM} challenge at the 27th conference on computational natural language learning}, publisher = {Association for Computational Linguistics}, author = {Amariucai, Theodor and Warstadt, Alexander Scott}, editor = {Warstadt, Alex and Mueller, Aaron and Choshen, Leshem and Wilcox, Ethan and Zhuang, Chengxu and Ciro, Juan and Mosquera, Rafael and Paranjabe, Bhargavi and Williams, Adina and Linzen, Tal and Cotterell, Ryan}, month = dec, year = {2023}, pages = {128--141}, }
- ACL Findings: Reconstruction probing. Najoung Kim, Jatin Khilnani, Alex Warstadt, and Abdelrahim Qaddoumi. In Findings of the Association for Computational Linguistics: ACL 2023, Jul 2023.
We propose reconstruction probing, a new analysis method for contextualized representations based on reconstruction probabilities in masked language models (MLMs). This method relies on comparing the reconstruction probabilities of tokens in a given sequence when conditioned on the representation of a single token that has been fully contextualized and when conditioned on only the decontextualized lexical prior of the model. This comparison can be understood as quantifying the contribution of contextualization towards reconstruction—the difference in the reconstruction probabilities can only be attributed to the representational change of the single token induced by contextualization. We apply this analysis to three MLMs and find that contextualization boosts reconstructability of tokens that are close to the token being reconstructed in terms of linear and syntactic distance. Furthermore, we extend our analysis to finer-grained decomposition of contextualized representations, and we find that these boosts are largely attributable to static and positional embeddings at the input layer.
@inproceedings{kim2023reconstruction, address = {Toronto, Canada}, title = {Reconstruction probing}, doi = {10.18653/v1/2023.findings-acl.523}, booktitle = {Findings of the association for computational linguistics: {ACL} 2023}, publisher = {Association for Computational Linguistics}, author = {Kim, Najoung and Khilnani, Jatin and Warstadt, Alex and Qaddoumi, Abdelrahim}, editor = {Rogers, Anna and Boyd-Graber, Jordan and Okazaki, Naoaki}, month = jul, year = {2023}, pages = {8240--8255}, }
- BabyLM: Proceedings of the BabyLM Challenge at the 27th Conference on Computational Natural Language Learning. Edited by Alex Warstadt, Aaron Mueller, Leshem Choshen, Ethan Wilcox, Chengxu Zhuang, Juan Ciro, Rafael Mosquera, Bhargavi Paranjabe, Adina Williams, Tal Linzen, and Ryan Cotterell. Dec 2023.
@proceedings{warstadt2023babylm, title = {Proceedings of the BabyLM Challenge at the 27th Conference on Computational Natural Language Learning}, editor = {Warstadt, Alex and Mueller, Aaron and Choshen, Leshem and Wilcox, Ethan and Zhuang, Chengxu and Ciro, Juan and Mosquera, Rafael and Paranjabe, Bhargavi and Williams, Adina and Linzen, Tal and Cotterell, Ryan}, month = dec, year = {2023}, address = {Singapore}, publisher = {Association for Computational Linguistics}, }
2022
- Book Chapter: What artificial neural networks can tell us about human language acquisition. Alex Warstadt and Samuel R. Bowman. In Algebraic Structures in Natural Language, 2022.
Rapid progress in machine learning for natural language processing has the potential to transform debates about how humans learn language. However, the learning environments and biases of current artificial learners and humans diverge in ways that weaken the impact of the evidence obtained from learning simulations. For example, today’s most effective neural language models are trained on roughly one thousand times the amount of linguistic data available to a typical child. To increase the relevance of learnability results from computational models, we need to train model learners without significant advantages over humans. If an appropriate model successfully acquires some target linguistic knowledge, it can provide a proof of concept that the target is learnable in a hypothesized human learning scenario. Plausible model learners will enable us to carry out experimental manipulations to make causal inferences about variables in the learning environment, and to rigorously test poverty-of-the-stimulus-style claims arguing for innate linguistic knowledge in humans. Comparable experiments will never be possible with human subjects due to practical and ethical considerations. So far, attempts to deprive current models of unfair advantages fail to achieve human-level grammatical knowledge. But before we can justifiably conclude that language learning requires more prior domain-specific knowledge than current models possess, we must first explore other training regimes as ways to make computational learners more efficient at learning from limited linguistic input.
@incollection{warstadt2022what, title = {What artificial neural networks can tell us about human language acquisition}, booktitle = {Algebraic {Structures} in {Natural} {Language}}, publisher = {CRC Press}, author = {Warstadt, Alex and Bowman, Samuel R}, editor = {Lappin, Shalom and Bernardy, Jean-Philippe}, year = {2022}, pages = {17--60}, }
- Sinn und Bedeutung: Testing Bayesian measures of relevance in discourse. Alex Warstadt and Omar Agha. In Proceedings of Sinn und Bedeutung, 2022.
@inproceedings{warstadt2022testing, title = {Testing {Bayesian} measures of relevance in discourse}, volume = {26}, booktitle = {Proceedings of sinn und bedeutung}, author = {Warstadt, Alex and Agha, Omar}, year = {2022}, pages = {865--886}, }
- Thesis: Artificial neural networks as models of human language acquisition. Alex Warstadt. PhD thesis, New York University, 2022.
@phdthesis{warstadt2022artificial, type = {{PhD} {Thesis}}, title = {Artificial neural networks as models of human language acquisition}, school = {New York University}, author = {Warstadt, Alex}, year = {2022}, }
- Amsterdam Colloquium: Presupposition triggering reflects pragmatic reasoning about utterance utility. Alex Warstadt. In Proceedings of the 2022 Amsterdam Colloquium, 2022.
@inproceedings{warstadt2022presupposition, title = {Presupposition triggering reflects pragmatic reasoning about utterance utility}, booktitle = {Proceedings of the 2022 {Amsterdam} {Colloquium}}, publisher = {EasyChair}, author = {Warstadt, Alex}, year = {2022}, }
- CoNLL: Entailment semantics can be extracted from an ideal language model. William Merrill, Alex Warstadt, and Tal Linzen. In Proceedings of the 26th Conference on Computational Natural Language Learning (CoNLL), Dec 2022.
Language models are often trained on text alone, without additional grounding. There is debate as to how much of natural language semantics can be inferred from such a procedure. We prove that entailment judgments between sentences can be extracted from an ideal language model that has perfectly learned its target distribution, assuming the training sentences are generated by Gricean agents, i.e., agents who follow fundamental principles of communication from the linguistic theory of pragmatics. We also show entailment judgments can be decoded from the predictions of a language model trained on such Gricean data. Our results reveal a pathway for understanding the semantic information encoded in unlabeled linguistic data and a potential framework for extracting semantics from language models.
@inproceedings{merrill2022entailment, address = {Abu Dhabi, United Arab Emirates (Hybrid)}, title = {Entailment semantics can be extracted from an ideal language model}, doi = {10.18653/v1/2022.conll-1.13}, booktitle = {Proceedings of the 26th conference on computational natural language learning ({CoNLL})}, publisher = {Association for Computational Linguistics}, author = {Merrill, William and Warstadt, Alex and Linzen, Tal}, editor = {Fokkens, Antske and Srikumar, Vivek}, month = dec, year = {2022}, pages = {176--193}, }
- ACL: What makes reading comprehension questions difficult? Saku Sugawara, Nikita Nangia, Alex Warstadt, and Samuel Bowman. In Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), May 2022.
For a natural language understanding benchmark to be useful in research, it has to consist of examples that are diverse and difficult enough to discriminate among current and near-future state-of-the-art systems. However, we do not yet know how best to select text sources to collect a variety of challenging examples. In this study, we crowdsource multiple-choice reading comprehension questions for passages taken from seven qualitatively distinct sources, analyzing what attributes of passages contribute to the difficulty and question types of the collected examples. To our surprise, we find that passage source, length, and readability measures do not significantly affect question difficulty. Through our manual annotation of seven reasoning types, we observe several trends between passage sources and reasoning types, e.g., logical reasoning is more often required in questions written for technical passages. These results suggest that when creating a new benchmark dataset, selecting a diverse set of passages can help ensure a diverse range of question types, but that passage difficulty need not be a priority.
@inproceedings{sugawara2022makes, address = {Dublin, Ireland}, title = {What makes reading comprehension questions difficult?}, doi = {10.18653/v1/2022.acl-long.479}, booktitle = {Proceedings of the 60th annual meeting of the association for computational linguistics (volume 1: {Long} papers)}, publisher = {Association for Computational Linguistics}, author = {Sugawara, Saku and Nangia, Nikita and Warstadt, Alex and Bowman, Samuel}, editor = {Muresan, Smaranda and Nakov, Preslav and Villavicencio, Aline}, month = may, year = {2022}, pages = {6951--6971}, }
- Diss. Chapter: The Role of Indirect Evidence in Grammar Learning: Investigations with Causal Manipulations of the Learning Environment. Alex Warstadt. Chapter 6 of Artificial neural networks as models of human language acquisition, PhD dissertation, New York University, 2022.
@incollection{warstadt2022indirect, title = {The {Role} of {Indirect} {Evidence} in {Grammar} {Learning}: {Investigations} with {Causal} {Manipulations} of the {Learning} {Environment}}, volume = {Chapter 6}, booktitle = {Artificial neural networks as models of human language acquisition}, publisher = {PhD Dissertation, New York University}, author = {Warstadt, Alex}, year = {2022}, }
2021
- ACL: When Do You Need Billions of Words of Pretraining Data? Yian Zhang, Alex Warstadt, Xiaocheng Li, and Samuel R. Bowman. In Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers), Aug 2021.
NLP is currently dominated by language models like RoBERTa which are pretrained on billions of words. But what exact knowledge or skills do Transformer LMs learn from large-scale pretraining that they cannot learn from less data? To explore this question, we adopt five styles of evaluation: classifier probing, information-theoretic probing, unsupervised relative acceptability judgments, unsupervised language model knowledge probing, and fine-tuning on NLU tasks. We then draw learning curves that track the growth of these different measures of model ability with respect to pretraining data volume using the MiniBERTas, a group of RoBERTa models pretrained on 1M, 10M, 100M and 1B words. We find that these LMs require only about 10M to 100M words to learn to reliably encode most syntactic and semantic features we test. They need a much larger quantity of data in order to acquire enough commonsense knowledge and other skills required to master typical downstream NLU tasks. The results suggest that, while the ability to encode linguistic features is almost certainly necessary for language understanding, it is likely that other, unidentified, forms of knowledge are the major drivers of recent improvements in language understanding among large pretrained models.
@inproceedings{zhang2021when, address = {Online}, title = {When {Do} {You} {Need} {Billions} of {Words} of {Pretraining} {Data}?}, doi = {10.18653/v1/2021.acl-long.90}, webdate = {2021-09-17}, booktitle = {Proceedings of the 59th {Annual} {Meeting} of the {Association} for {Computational} {Linguistics} and the 11th {International} {Joint} {Conference} on {Natural} {Language} {Processing} ({Volume} 1: {Long} {Papers})}, publisher = {Association for Computational Linguistics}, author = {Zhang, Yian and Warstadt, Alex and Li, Xiaocheng and Bowman, Samuel R.}, month = aug, year = {2021}, pages = {1112--1125}, }
- CoNLL: NOPE: A corpus of naturally-occurring presuppositions in English. Alicia Parrish, Sebastian Schuster, Alex Warstadt, Omar Agha, Soo-Hwan Lee, Zhuoye Zhao, Samuel R. Bowman, and Tal Linzen. In Proceedings of the 25th Conference on Computational Natural Language Learning, Nov 2021.
Understanding language requires grasping not only the overtly stated content, but also making inferences about things that were left unsaid. These inferences include presuppositions, a phenomenon by which a listener learns about new information through reasoning about what a speaker takes as given. Presuppositions require complex understanding of the lexical and syntactic properties that trigger them as well as the broader conversational context. In this work, we introduce the Naturally-Occurring Presuppositions in English (NOPE) Corpus to investigate the context-sensitivity of 10 different types of presupposition triggers and to evaluate machine learning models’ ability to predict human inferences. We find that most of the triggers we investigate exhibit moderate variability. We further find that transformer-based models draw correct inferences in simple cases involving presuppositions, but they fail to capture the minority of exceptional cases in which human judgments reveal complex interactions between context and triggers.
@inproceedings{parrish2021nope, address = {Online}, title = {{NOPE}: a corpus of naturally-occurring presuppositions in {English}}, doi = {10.18653/v1/2021.conll-1.28}, booktitle = {Proceedings of the 25th conference on computational natural language learning}, publisher = {Association for Computational Linguistics}, author = {Parrish, Alicia and Schuster, Sebastian and Warstadt, Alex and Agha, Omar and Lee, Soo-Hwan and Zhao, Zhuoye and Bowman, Samuel R. and Linzen, Tal}, editor = {Bisazza, Arianna and Abend, Omri}, month = nov, year = {2021}, pages = {349--366}, }
- ACL: What ingredients make for an effective crowdsourcing protocol for difficult NLU data collection tasks? Nikita Nangia, Saku Sugawara, Harsh Trivedi, Alex Warstadt, Clara Vania, and Samuel R. Bowman. In Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers), Aug 2021.
Crowdsourcing is widely used to create data for common natural language understanding tasks. Despite the importance of these datasets for measuring and refining model understanding of language, there has been little focus on the crowdsourcing methods used for collecting the datasets. In this paper, we compare the efficacy of interventions that have been proposed in prior work as ways of improving data quality. We use multiple-choice question answering as a testbed and run a randomized trial by assigning crowdworkers to write questions under one of four different data collection protocols. We find that asking workers to write explanations for their examples is an ineffective stand-alone strategy for boosting NLU example difficulty. However, we find that training crowdworkers, and then using an iterative process of collecting data, sending feedback, and qualifying workers based on expert judgments is an effective means of collecting challenging data. But using crowdsourced, instead of expert judgments, to qualify workers and send feedback does not prove to be effective. We observe that the data from the iterative protocol with expert assessments is more challenging by several measures. Notably, the human–model gap on the unanimous agreement portion of this data is, on average, twice as large as the gap for the baseline protocol data.
@inproceedings{nangia2021ingredients, address = {Online}, title = {What ingredients make for an effective crowdsourcing protocol for difficult {NLU} data collection tasks?}, doi = {10.18653/v1/2021.acl-long.98}, booktitle = {Proceedings of the 59th annual meeting of the association for computational linguistics and the 11th international joint conference on natural language processing (volume 1: {Long} papers)}, publisher = {Association for Computational Linguistics}, author = {Nangia, Nikita and Sugawara, Saku and Trivedi, Harsh and Warstadt, Alex and Vania, Clara and Bowman, Samuel R.}, editor = {Zong, Chengqing and Xia, Fei and Li, Wenjie and Navigli, Roberto}, month = aug, year = {2021}, pages = {1221--1235}, }
- EMNLP Findings: Does putting a linguist in the loop improve NLU data collection? Alicia Parrish, William Huang, Omar Agha, Soo-Hwan Lee, Nikita Nangia, Alex Warstadt, Karmanya Aggarwal, Emily Allaway, Tal Linzen, and Samuel R. Bowman. In Findings of the Association for Computational Linguistics: EMNLP 2021, Nov 2021.
Many crowdsourced NLP datasets contain systematic artifacts that are identified only after data collection is complete. Earlier identification of these issues should make it easier to create high-quality training and evaluation data. We attempt this by evaluating protocols in which expert linguists work ‘in the loop’ during data collection to identify and address these issues by adjusting task instructions and incentives. Using natural language inference as a test case, we compare three data collection protocols: (i) a baseline protocol with no linguist involvement, (ii) a linguist-in-the-loop intervention with iteratively-updated constraints on the writing task, and (iii) an extension that adds direct interaction between linguists and crowdworkers via a chatroom. We find that linguist involvement does not lead to increased accuracy on out-of-domain test sets compared to baseline, and adding a chatroom has no effect on the data. Linguist involvement does, however, lead to more challenging evaluation data and higher accuracy on some challenge sets, demonstrating the benefits of integrating expert analysis during data collection.
@inproceedings{parrish2021linguist, address = {Punta Cana, Dominican Republic}, title = {Does putting a linguist in the loop improve {NLU} data collection?}, doi = {10.18653/v1/2021.findings-emnlp.421}, booktitle = {Findings of the association for computational linguistics: {EMNLP} 2021}, publisher = {Association for Computational Linguistics}, author = {Parrish, Alicia and Huang, William and Agha, Omar and Lee, Soo-Hwan and Nangia, Nikita and Warstadt, Alexia and Aggarwal, Karmanya and Allaway, Emily and Linzen, Tal and Bowman, Samuel R.}, editor = {Moens, Marie-Francine and Huang, Xuanjing and Specia, Lucia and Yih, Scott Wen-tau}, month = nov, year = {2021}, pages = {4886--4901}, }
- EACL: CLiMP: A benchmark for Chinese language model evaluation. Beilei Xiang, Changbing Yang, Yu Li, Alex Warstadt, and Katharina Kann. In Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Main Volume, Apr 2021.
Linguistically informed analyses of language models (LMs) contribute to the understanding and improvement of such models. Here, we introduce the corpus of Chinese linguistic minimal pairs (CLiMP) to investigate what knowledge Chinese LMs acquire. CLiMP consists of sets of 1000 minimal pairs (MPs) for 16 syntactic contrasts in Chinese, covering 9 major Chinese linguistic phenomena. The MPs are semi-automatically generated, and human agreement with the labels in CLiMP is 95.8%. We evaluate 11 different LMs on CLiMP, covering n-grams, LSTMs, and Chinese BERT. We find that classifier–noun agreement and verb complement selection are the phenomena that models generally perform best at. However, models struggle the most with the ba construction, binding, and filler-gap dependencies. Overall, Chinese BERT achieves an 81.8% average accuracy, while the performances of LSTMs and 5-grams are only moderately above chance level.
@inproceedings{xiang2021climp, address = {Online}, title = {{CLiMP}: a benchmark for {Chinese} language model evaluation}, doi = {10.18653/v1/2021.eacl-main.242}, booktitle = {Proceedings of the 16th conference of the european chapter of the association for computational linguistics: {Main} volume}, publisher = {Association for Computational Linguistics}, author = {Xiang, Beilei and Yang, Changbing and Li, Yu and Warstadt, Alex and Kann, Katharina}, editor = {Merlo, Paola and Tiedemann, Jorg and Tsarfaty, Reut}, month = apr, year = {2021}, pages = {2784--2790}, }
2020
- EMNLP: Learning which features matter: RoBERTa acquires a preference for linguistic generalizations (eventually). Alex Warstadt, Yian Zhang, Xiaocheng Li, Haokun Liu, and Samuel R. Bowman. In Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP), Nov 2020.
One reason pretraining on self-supervised linguistic tasks is effective is that it teaches models features that are helpful for language understanding. However, we want pretrained models to learn not only to represent linguistic features, but also to use those features preferentially during fine-tuning. With this goal in mind, we introduce a new English-language diagnostic set called MSGS (the Mixed Signals Generalization Set), which consists of 20 ambiguous binary classification tasks that we use to test whether a pretrained model prefers linguistic or surface generalizations during finetuning. We pretrain RoBERTa models from scratch on quantities of data ranging from 1M to 1B words and compare their performance on MSGS to the publicly available RoBERTa_BASE. We find that models can learn to represent linguistic features with little pretraining data, but require far more data to learn to prefer linguistic generalizations over surface ones. Eventually, with about 30B words of pretraining data, RoBERTa_BASE does demonstrate a linguistic bias with some regularity. We conclude that while self-supervised pretraining is an effective way to learn helpful inductive biases, there is likely room to improve the rate at which models learn which features matter.
@inproceedings{warstadt2020learning, address = {Online}, title = {Learning which features matter: {RoBERTa} acquires a preference for linguistic generalizations (eventually)}, doi = {10.18653/v1/2020.emnlp-main.16}, booktitle = {Proceedings of the 2020 conference on empirical methods in natural language processing ({EMNLP})}, publisher = {Association for Computational Linguistics}, author = {Warstadt, Alex and Zhang, Yian and Li, Xiaocheng and Liu, Haokun and Bowman, Samuel R.}, month = nov, year = {2020}, pages = {217--235}, }
- Sinn und Bedeutung: Non-resolving responses to polar questions: A revision to the QUD theory of relevance. Omar Agha and Alex Warstadt. In Proceedings of Sinn und Bedeutung, Sep 2020.
The influential Question Under Discussion (QUD) theory of discourse (Roberts, 2012) formalizes Grice’s notion of relevance. In this paper, we identify a class of relevant discourse moves where Roberts’s account undergenerates, and propose a more inclusive definition of relevance. For example, if asked Should we cancel the picnic?, one can reply If it rains without fully resolving the question. However, in Roberts’s theory, all relevant responses to polar questions are predicted to fully resolve the question because a relevant answer must eliminate at least one alternative in the QUD. We propose that a non-resolving response to a polar question is relevant if it eliminates a set of worlds that overlaps with only some alternatives in the QUD. The new account turns out to make good predictions in the domain of polar questions, and beyond.
@inproceedings{agha2020nonresolving, title = {Non-resolving responses to polar questions: {A} revision to the {QUD} theory of relevance}, volume = {24}, shorttitle = {Non-resolving responses to polar questions}, doi = {10.18148/sub/2020.v24i1.850}, language = {english}, booktitle = {Proceedings of {Sinn} und {Bedeutung}}, author = {Agha, Omar and Warstadt, Alex}, month = sep, year = {2020}, pages = {17--34}, }
- TACL: BLiMP: The Benchmark of Linguistic Minimal Pairs for English. Alex Warstadt, Alicia Parrish, Haokun Liu, Anhad Mohananey, Wei Peng, Sheng-Fu Wang, and Samuel R. Bowman. Transactions of the Association for Computational Linguistics, 2020.
We introduce The Benchmark of Linguistic Minimal Pairs (BLiMP), a challenge set for evaluating the linguistic knowledge of language models (LMs) on major grammatical phenomena in English. BLiMP consists of 67 individual datasets, each containing 1,000 minimal pairs—that is, pairs of minimally different sentences that contrast in grammatical acceptability and isolate a specific phenomenon in syntax, morphology, or semantics. We generate the data according to linguist-crafted grammar templates, and human aggregate agreement with the labels is 96.4%. We evaluate n-gram, LSTM, and Transformer (GPT-2 and Transformer-XL) LMs by observing whether they assign a higher probability to the acceptable sentence in each minimal pair. We find that state-of-the-art models identify morphological contrasts related to agreement reliably, but they struggle with some subtle semantic and syntactic phenomena, such as negative polarity items and extraction islands.
@article{warstadt2020blimp, title = {{BLiMP}: {The} {Benchmark} of {Linguistic} {Minimal} {Pairs} for {English}}, volume = {8}, doi = {10.1162/tacl_a_00321}, journal = {Transactions of the Association for Computational Linguistics}, author = {Warstadt, Alex and Parrish, Alicia and Liu, Haokun and Mohananey, Anhad and Peng, Wei and Wang, Sheng-Fu and Bowman, Samuel R.}, year = {2020}, pages = {377--392}, }
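BLiMP's evaluation criterion is simple enough to sketch in a few lines: an LM is credited for a minimal pair when it assigns a higher total log-probability to the acceptable sentence. The snippet below is a hedged illustration using the Hugging Face GPT-2 checkpoint and a hypothetical agreement pair, not the paper's released evaluation code.

```python
import torch
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2").eval()

def total_logprob(sentence: str) -> float:
    ids = tokenizer(sentence, return_tensors="pt").input_ids
    with torch.no_grad():
        # the returned loss is the mean negative log-likelihood over the
        # n - 1 predicted tokens, so scale back up to a total log-probability
        loss = model(ids, labels=ids).loss
    return -loss.item() * (ids.size(1) - 1)

# a hypothetical minimal pair in the style of BLiMP's agreement phenomena
good = "These dogs were barking all night."
bad = "These dogs was barking all night."
print(total_logprob(good) > total_logprob(bad))  # the LM "passes" if True
```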
- Sinn und Bedeutung"Just" don’t ask: Exclusives and potential questionsAlex WarstadtIn Proceedings of Sinn und Bedeutung, Sep 2020
The English exclusive just is not synonymous with other exclusives such as only in sentences like Sometimes, bad things just/only happen. I give a new analysis of just which explains this and other puzzling readings of just observed in earlier work (e.g. Wiegand, 2016; Beltrama, 2018). I argue that just excludes alternatives derived from a potential question, or possible future QUD, in the sense of Onea (2016). This new perspective makes it possible to give the first unified account of these non-canonical exclusive readings of just, and provides evidence that the semantics of lexical items can be sensitive to possible futures of the discourse.
@inproceedings{warstadt2020just, title = {"{Just}" don’t ask: {Exclusives} and potential questions}, volume = {24}, doi = {10.18148/sub/2020.v24i2.903}, booktitle = {Proceedings of {Sinn} und {Bedeutung}}, author = {Warstadt, Alex}, month = sep, year = {2020}, pages = {373--390}, }
- CogSci: Can neural networks acquire a structural bias from raw linguistic data? Alex Warstadt and Samuel R. Bowman. In Proceedings of the 42nd Annual Conference of the Cognitive Science Society, 2020.
@inproceedings{warstadt2020can, title = {Can neural networks acquire a structural bias from raw linguistic data?}, booktitle = {Proceedings of the 42nd {Annual} {Conference} of the {Cognitive} {Science} {Society}.}, author = {Warstadt, Alex and Bowman, Samuel R}, year = {2020}, }
- ACL: Are natural language inference models IMPPRESsive? Learning IMPlicature and PRESupposition. Paloma Jeretic, Alex Warstadt, Suvrat Bhooshan, and Adina Williams. In Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, Jul 2020.
Natural language inference (NLI) is an increasingly important task for natural language understanding, which requires one to infer whether a sentence entails another. However, the ability of NLI models to make pragmatic inferences remains understudied. We create an IMPlicature and PRESupposition diagnostic dataset (IMPPRES), consisting of 32K semi-automatically generated sentence pairs illustrating well-studied pragmatic inference types. We use IMPPRES to evaluate whether BERT, InferSent, and BOW NLI models trained on MultiNLI (Williams et al., 2018) learn to make pragmatic inferences. Although MultiNLI appears to contain very few pairs illustrating these inference types, we find that BERT learns to draw pragmatic inferences. It reliably treats scalar implicatures triggered by “some” as entailments. For some presupposition triggers like “only”, BERT reliably recognizes the presupposition as an entailment, even when the trigger is embedded under an entailment canceling operator like negation. BOW and InferSent show weaker evidence of pragmatic reasoning. We conclude that NLI training encourages models to learn some, but not all, pragmatic inferences.
@inproceedings{jeretic2020natural, address = {Online}, title = {Are natural language inference models {IMPPRESsive}? {Learning} {IMPlicature} and {PRESupposition}}, doi = {10.18653/v1/2020.acl-main.768}, booktitle = {Proceedings of the 58th annual meeting of the association for computational linguistics}, publisher = {Association for Computational Linguistics}, author = {Jeretic, Paloma and Warstadt, Alex and Bhooshan, Suvrat and Williams, Adina}, editor = {Jurafsky, Dan and Chai, Joyce and Schluter, Natalie and Tetreault, Joel}, month = jul, year = {2020}, pages = {8690--8705}, }
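The IMPPRES evaluation amounts to feeding pragmatically loaded premise-hypothesis pairs to an MNLI-trained model and inspecting its label probabilities. The sketch below is illustrative only: it uses the publicly available roberta-large-mnli checkpoint rather than the BERT, InferSent, or BOW models evaluated in the paper, and a hand-written scalar-implicature pair rather than the IMPPRES data.

```python
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

tok = AutoTokenizer.from_pretrained("roberta-large-mnli")
model = AutoModelForSequenceClassification.from_pretrained("roberta-large-mnli").eval()

premise = "Some of the students passed."
hypothesis = "Not all of the students passed."  # the scalar implicature of "some"

inputs = tok(premise, hypothesis, return_tensors="pt")
with torch.no_grad():
    probs = model(**inputs).logits.softmax(-1)[0]

# roberta-large-mnli label order: 0 = contradiction, 1 = neutral, 2 = entailment
for label, p in zip(["contradiction", "neutral", "entailment"], probs.tolist()):
    print(f"{label}: {p:.3f}")
```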
2019
- SCiL: Verb Argument Structure Alternations in Word and Sentence Embeddings. Katharina Kann, Alex Warstadt, Adina Williams, and Samuel R. Bowman. In Proceedings of the Society for Computation in Linguistics (SCiL), 2019.
@inproceedings{kann2019verb, title = {Verb {Argument} {Structure} {Alternations} in {Word} and {Sentence} {Embeddings}}, doi = {10.7275/q5js-4y86}, booktitle = {Proceedings of the {Society} for {Computation} in {Linguistics} ({SCiL}) 2019}, author = {Kann, Katharina and Warstadt, Alex and Williams, Adina and Bowman, Samuel R.}, year = {2019}, pages = {287--297}, }
- Manuscript: Linguistic Analysis of Pretrained Sentence Encoders with Acceptability Judgments. Alex Warstadt and Samuel R. Bowman. 2019.
@unpublished{warstadt2019linguistic, title = {Linguistic {Analysis} of {Pretrained} {Sentence} {Encoders} with {Acceptability} {Judgments}}, author = {Warstadt, Alex and Bowman, Samuel R.}, year = {2019}, }
- EMNLP: Investigating BERT’s knowledge of language: Five analysis methods with NPIs. Alex Warstadt, Yu Cao, Ioana Grosu, and 13 more authors. In Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP), Nov 2019.
Though state-of-the-art sentence representation models can perform tasks requiring significant knowledge of grammar, it is an open question how best to evaluate their grammatical knowledge. We explore five experimental methods inspired by prior work evaluating pretrained sentence representation models. We use a single linguistic phenomenon, negative polarity item (NPI) licensing, as a case study for our experiments. NPIs like any are grammatical only if they appear in a licensing environment like negation (Sue doesn’t have any cats vs. *Sue has any cats). This phenomenon is challenging because of the variety of NPI licensing environments that exist. We introduce an artificially generated dataset that manipulates key features of NPI licensing for the experiments. We find that BERT has significant knowledge of these features, but its success varies widely across different experimental methods. We conclude that a variety of methods is necessary to reveal all relevant aspects of a model’s grammatical knowledge in a given domain.
@inproceedings{warstadt2019investigating, address = {Hong Kong, China}, title = {Investigating {BERT}'s knowledge of language: {Five} analysis methods with {NPIs}}, doi = {10.18653/v1/D19-1286}, booktitle = {Proceedings of the 2019 conference on empirical methods in natural language processing and the 9th international joint conference on natural language processing ({EMNLP}-{IJCNLP})}, publisher = {Association for Computational Linguistics}, author = {Warstadt, Alex and Cao, Yu and Grosu, Ioana and Peng, Wei and Blix, Hagen and Nie, Yining and Alsop, Anna and Bordia, Shikha and Liu, Haokun and Parrish, Alicia and Wang, Sheng-Fu and Phang, Jason and Mohananey, Anhad and Htut, Phu Mon and Jeretic, Paloma and Bowman, Samuel R.}, month = nov, year = {2019}, pages = {2877--2887}, }
- TACL: Neural network acceptability judgments. Alex Warstadt, Amanpreet Singh, and Samuel R. Bowman. Transactions of the Association for Computational Linguistics, 2019.
This paper investigates the ability of artificial neural networks to judge the grammatical acceptability of a sentence, with the goal of testing their linguistic competence. We introduce the Corpus of Linguistic Acceptability (CoLA), a set of 10,657 English sentences labeled as grammatical or ungrammatical from published linguistics literature. As baselines, we train several recurrent neural network models on acceptability classification, and find that our models outperform unsupervised models by Lau et al. (2016) on CoLA. Error-analysis on specific grammatical phenomena reveals that both Lau et al.’s models and ours learn systematic generalizations like subject-verb-object order. However, all models we test perform far below human level on a wide range of grammatical constructions.
@article{warstadt2019neural, title = {Neural network acceptability judgments}, volume = {7}, doi = {10.1162/tacl_a_00290}, journal = {Transactions of the Association for Computational Linguistics}, author = {Warstadt, Alex and Singh, Amanpreet and Bowman, Samuel R.}, editor = {Lee, Lillian and Johnson, Mark and Roark, Brian and Nenkova, Ani}, year = {2019}, pages = {625--641}, }
2015
- ESSLLI: Right-node wrapping: A combinatory account. Alexander Warstadt. In Proceedings for ESSLLI 2015 Workshop ‘Empirical Advances in Categorial Grammar’, 2015.
@inproceedings{warstadt2015rightnode, title = {Right-node wrapping: {A} combinatory account}, booktitle = {Proceedings for {ESSLLI} 2015 {Workshop} `{Empirical} {Advances} in {Categorial} {Grammar}'}, publisher = {Citeseer}, author = {Warstadt, Alexander}, year = {2015}, pages = {183--207}, }