publications
2025
- BabyBabelLM: A Multilingual Benchmark of Developmentally Plausible Training Data
  Jaap Jumelet, Abdellah Fourtassi, Akari Haga, Bastian Bunzeck, Bhargav Shandilya, Diana Galvan-Sosa, Faiz Ghifari Haznitrama, Francesca Padovani, Francois Meyer, Hai Hu, Julen Etxaniz, Laurent Prévot, Linyang He, María Grandury, Mila Marcheva, Negar Foroutan, Nikitas Theodoropoulos, Pouya Sadeghi, Siyuan Song, Suchir Salhan, Susana Zhou, Yurii Paniv, Ziyin Zhang, Arianna Bisazza, Alex Warstadt, and Leshem Choshen
  arXiv preprint arXiv:2510.10159, 2025
We present BabyBabelLM, a multilingual collection of datasets modeling the language a person observes from birth until they acquire a native language. We curate developmentally plausible pretraining data aiming to cover the equivalent of 100M English words of content in each of 45 languages. We compile evaluation suites and train baseline models in each language. BabyBabelLM aims to facilitate multilingual pretraining and cognitive modeling.
@misc{babybabellm,
  title = {BabyBabelLM: A Multilingual Benchmark of Developmentally Plausible Training Data},
  author = {Jumelet, Jaap and Fourtassi, Abdellah and Haga, Akari and Bunzeck, Bastian and Shandilya, Bhargav and Galvan-Sosa, Diana and Haznitrama, Faiz Ghifari and Padovani, Francesca and Meyer, Francois and Hu, Hai and Etxaniz, Julen and Prévot, Laurent and He, Linyang and Grandury, María and Marcheva, Mila and Foroutan, Negar and Theodoropoulos, Nikitas and Sadeghi, Pouya and Song, Siyuan and Salhan, Suchir and Zhou, Susana and Paniv, Yurii and Zhang, Ziyin and Bisazza, Arianna and Warstadt, Alex and Choshen, Leshem},
  year = {2025},
  eprint = {2510.10159},
  archiveprefix = {arXiv},
  primaryclass = {cs.CL},
  url = {https://arxiv.org/abs/2510.10159},
}
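As a rough illustration of how a collection like this might be consumed, the sketch below loads one language tier with the HuggingFace `datasets` library and does a crude word count. The hub ID, config name, and column name are hypothetical placeholders, not identifiers from the paper; check the BabyBabelLM project page for the real ones.

```python
# Hypothetical usage sketch: the hub ID "babylm-community/babybabellm",
# the config "eng", and the "text" column are placeholders; check the
# BabyBabelLM project page for the real identifiers.
from datasets import load_dataset

ds = load_dataset("babylm-community/babybabellm", "eng", split="train")

# Crude whitespace word count. The paper targets the *equivalent* of
# 100M English words per language, so a raw count in another language
# need not match this figure exactly.
n_words = sum(len(example["text"].split()) for example in ds)
print(f"{n_words:,} whitespace-delimited words")
```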
2024
- BERTtime Stories: Investigating the Role of Synthetic Story Data in Language Pre-training
  Nikitas Theodoropoulos, Giorgos Filandrianos, Vassilis Lyberatos, Maria Lymperaiou, and Giorgos Stamou
  In The 2nd BabyLM Challenge at the 28th Conference on Computational Natural Language Learning, Nov 2024
We describe our contribution to the Strict and Strict-Small tracks of the 2nd iteration of the BabyLM Challenge. The shared task is centered around efficient pre-training given data constraints motivated by human development. In response, we study the effect of synthetic story data in language pre-training using *TinyStories*: a recently introduced dataset of short stories. Initially, we train GPT-Neo models on subsets of *TinyStories*, while varying the amount of available data. We find that, even with access to less than 100M words, the models are able to generate high-quality, original completions to a given story, and acquire substantial linguistic knowledge. To measure the effect of synthetic story data, we train *LTG-BERT* encoder models on a combined dataset of: a subset of *TinyStories*, story completions generated by GPT-Neo, and a subset of the *BabyLM* dataset. Our experimentation reveals that synthetic data can occasionally offer modest gains, but overall have a negative influence on linguistic understanding. Our work offers an initial study on synthesizing story data in low resource settings and underscores their potential for augmentation in data-constrained language modeling. We publicly release our models and implementation on our GitHub.
@inproceedings{theodoropoulos-etal-2024-berttime,
  title = {{BERT}time Stories: Investigating the Role of Synthetic Story Data in Language Pre-training},
  author = {Theodoropoulos, Nikitas and Filandrianos, Giorgos and Lyberatos, Vassilis and Lymperaiou, Maria and Stamou, Giorgos},
  editor = {Hu, Michael Y. and Mueller, Aaron and Ross, Candace and Williams, Adina and Linzen, Tal and Zhuang, Chengxu and Choshen, Leshem and Cotterell, Ryan and Warstadt, Alex and Wilcox, Ethan Gotlieb},
  booktitle = {The 2nd BabyLM Challenge at the 28th Conference on Computational Natural Language Learning},
  month = nov,
  year = {2024},
  address = {Miami, FL, USA},
  publisher = {Association for Computational Linguistics},
  url = {https://aclanthology.org/2024.conll-babylm.28/},
  pages = {308--323},
}
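To make the generation step in the entry above concrete, here is a minimal sketch of sampling a story completion from a GPT-Neo model with HuggingFace `transformers`. The public checkpoint and decoding settings are stand-ins chosen for illustration; the paper trains its own GPT-Neo models on *TinyStories* subsets.

```python
# Minimal sketch: sample an original completion for a story prefix.
# "EleutherAI/gpt-neo-125m" is a public stand-in checkpoint, not one of
# the paper's own TinyStories-trained models.
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "EleutherAI/gpt-neo-125m"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

prefix = "Once upon a time, a little fox found a shiny key."
inputs = tokenizer(prefix, return_tensors="pt")
outputs = model.generate(
    **inputs,
    max_new_tokens=100,
    do_sample=True,   # sampling, so completions are original rather than copied
    top_p=0.9,
    temperature=0.8,
    pad_token_id=tokenizer.eos_token_id,  # GPT-Neo has no dedicated pad token
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```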
2022
- From Solution Synthesis to Student Attempt Synthesis for Block-Based Visual Programming Tasks
  Adish Singla and Nikitas Theodoropoulos
  In Proceedings of the 15th International Conference on Educational Data Mining (EDM), Jul 2022
Block-based visual programming environments are increasingly used to introduce computing concepts to beginners. Given that programming tasks are open-ended and conceptual, novice students often struggle when learning in these environments. AI-driven programming tutors hold great promise in automatically assisting struggling students, and need several components to realize this potential. We investigate the crucial component of student modeling, in particular, the ability to automatically infer students’ misconceptions for predicting (synthesizing) their behavior. We introduce a novel benchmark, StudentSyn, centered around the following challenge: For a given student, synthesize the student’s attempt on a new target task after observing the student’s attempt on a fixed reference task. This challenge is akin to that of program synthesis; however, instead of synthesizing a solution (i.e., program an expert would write), the goal here is to synthesize a student attempt (i.e., program that a given student would write). We first show that human experts (TutorSS) can achieve high performance on the benchmark, whereas simple baselines perform poorly. Then, we develop two neuro/symbolic techniques (NeurSS and SymSS) in a quest to close this gap with TutorSS. We will publicly release the benchmark to facilitate future research in this area.
@inproceedings{2022.EDM-short-papers.45,
  title = {From {Solution} Synthesis to {Student Attempt} Synthesis for Block-Based Visual Programming Tasks},
  author = {Singla, Adish and Theodoropoulos, Nikitas},
  editor = {Mitrovic, Antonija and Bosch, Nigel},
  booktitle = {Proceedings of the 15th International Conference on Educational Data Mining (EDM)},
  month = jul,
  year = {2022},
  address = {Durham, United Kingdom},
  publisher = {International Educational Data Mining Society},
  isbn = {978-1-7336736-3-1},
  pages = {454--461},
}
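For readers unfamiliar with the setup, the sketch below encodes one StudentSyn-style instance as a plain data structure, with a naive copy baseline standing in for a real technique such as NeurSS or SymSS. All field and function names are invented for exposition; the released benchmark may organize its data differently.

```python
# Illustrative data structure for the StudentSyn challenge; field names
# are invented for exposition, not taken from the benchmark release.
from dataclasses import dataclass
from typing import Optional

@dataclass
class StudentSynInstance:
    student_id: str
    reference_task: str                    # fixed task the student already attempted
    reference_attempt: str                 # the student's observed (possibly buggy) program
    target_task: str                       # new task the student has not seen
    target_attempt: Optional[str] = None   # what a technique must synthesize

def synthesize_attempt(instance: StudentSynInstance) -> str:
    """Naive baseline: reuse the reference attempt verbatim. A real
    technique (e.g. NeurSS or SymSS) would instead infer the student's
    misconception from the reference attempt and apply it to the
    target task."""
    return instance.reference_attempt
```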