% Conference paper entry (CVPR 2021 proceedings). The month uses the predefined
% macro and the page range uses an en dash so the style controls rendering.
@InProceedings{Monfort_2021_CVPR,
    author    = {Monfort, Mathew and Jin, SouYoung and Liu, Alexander and Harwath, David and Feris, Rogerio and Glass, James and Oliva, Aude},
    title     = {Spoken Moments: Learning Joint Audio-Visual Representations From Video Descriptions},
    booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
    month     = jun,
    year      = {2021},
    pages     = {14871--14881},
}
